Skip to content Skip to navigation

Creating an RSS Feed from a Facebook Wall

« previous next »

I'm not the first one to run into this problem, but in searching around I couldn't find anyone who had actually bothered to solve it: how to get an RSS feed from a publicly accessible Facebook wall. The wall can be either a fan-page wall or a publicly accessible group's wall.

In my case I wanted to pull posts from various fan-page and group walls into a Drupal aggregator page. The following Perl script does the job by scraping the HTML for certain key id and class names in Facebook's markup, extracting the text, and turning it into a valid RSS feed.

The script is called with the following syntax (assuming the file is named "facebookrss.pl"):

For pages with the standard URL syntax:
http://[yourserver]/[yourpath]/facebookrss.pl?url=/pages/pagename/[numericpageid]&title=[your desired RSS feed title]&description=[your desired RSS feed description]

eg:
http://[yourserver]/[yourpath]/facebookrss.pl?url=/pages/Duct-Tape/91090174601&title=Duct%20Tape&description=For%20all%20actual%20repairs

For pages with "friendly" URLs:
http://[yourserver]/[yourpath]/facebookrss.pl?url=/[friendly name]&title=[your desired RSS feed title]&description=[your desired RSS feed description]

eg:
http://[yourserver]/[yourpath]/facebookrss.pl?url=/RedGreen&title=Red%20Green&description=If%20they%20don't%20find%20you%20handsome%20they%20better%20find%20you%20handy

For groups:
http://[yourserver]/[yourpath]/facebookrss.pl?url=/group.php?gid=[numericgroupid]&title=[your desired RSS feed title]&description=[your desired RSS feed description]

eg:
http://[yourserver]/[yourpath]/facebookrss.pl?url=/group.php?gid=2223250628&title=Duct%20Tape&description=For%20all%20actual%20repairs

Here's the code (be careful about extraneous line breaks caused by the browser - they should be fairly obvious):

#!/usr/bin/perl -w
use CGI qw(:standard);
use LWP::Simple qw(getstore);
use LWP::UserAgent;
use HTML::TokeParser::Simple;
use Time::Format qw(%time %strftime %manip);
use Date::Manip qw(ParseDate UnixDate DateCalc);
use HTML::Entities;
use URI::Escape;
use Encode;

# Boolean flags used by the rest of the script.
($true, $false) = (1, !1);

# HTTP client used to fetch the wall; it identifies itself with an old
# browser identification string.
$ua = LWP::UserAgent->new(agent => "Mozilla/2.02E (Win95; U)");

# Send the CGI response header before any feed content is printed.
print header(-type => 'text/xml');
#print header(-type => 'text/plain'); #debug

# Read the request parameters and build the URL of the wall to scrape.
#   url         - path on facebook.com (page path, friendly name, or
#                 group.php?gid=N)
#   title       - channel title for the generated feed
#   description - channel description for the generated feed
# Missing title/description become empty strings so later string handling
# never sees undef.
$title = param("title");
$title = "" unless defined $title;
$description = param("description");
$description = "" unless defined $description;

$base = "http://www.facebook.com";
$suffixToken = "?";
$suffix = "v=wall";

# The path comes straight from the query string, so treat it as untrusted.
# It is forced to start with "/": without the leading slash a value such as
# "@example.com/" would be glued onto the host name and send the fetch to a
# different server.
my $path = param("url");
$path = "" unless defined $path;
$path = "/" . $path unless $path =~ m{\A/};
$url = $base . $path;

# Group walls already carry a query string (group.php?gid=...), so the wall
# selector is appended with "&" instead of "?".
if ($url =~ /group\.php/) {
    $url = uri_unescape($url);
    $suffixToken = "&";
}
$url .= $suffixToken . $suffix;
$pubDate = 0;

# Retrieve the page.  $file ends up holding the decoded HTML; it stays an
# empty string when the fetch or the decoding fails, in which case the feed
# is emitted without items.
$file = "";
$request = HTTP::Request->new(GET => $url);
$response = $ua->request($request);
if ($response->is_success) {
    # Fix the encoding if the HTTP response header doesn't match (shouldn't
    # matter with FB though): an encoding="..." declaration found in the body
    # overrides the assumed default.
    $content = $response->content;
    $encoding = "utf8"; # assume this is the default
    if ($content =~ /encoding="([^"]+)"/) {
        $encoding = $1;
    }
    # decoded_content returns undef when the body cannot be decoded (for
    # example an unknown charset name); keep $file a defined string so the
    # parser always gets something to read.
    my $decoded = $response->decoded_content(charset => $encoding);
    $file = $decoded if defined $decoded;
} else {
    # Report on STDERR (normally the web server's error log) so the feed
    # output itself stays well formed.
    warn "facebookrss: could not fetch $url: " . $response->status_line . "\n";
}

# Properly encode the channel fields for RSS.  Numeric character references
# are used because the named entities HTML::Entities::encode produces for
# non-ASCII characters (&eacute; and friends) are not defined in XML and make
# the feed ill-formed.  Undefined title/description are treated as empty.
$encTitle = HTML::Entities::encode_entities_numeric(defined $title ? $title : "");
$encUrl = HTML::Entities::encode_entities_numeric($url);
$encDescription = HTML::Entities::encode_entities_numeric(defined $description ? $description : "");

# begin RSS output
print "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>
<rss version=\"2.0\">
<channel>
<title>$encTitle</title>
<link>$encUrl</link>
<description>$encDescription
</description>
";

# Parse the HTML and pull out the wall entries.
#
# Every start tag whose markup mentions "UIIntentionalStory" is taken as the
# beginning of a wall post.  For each post the loop:
#   1. copies the post's markup into $content, dropping div/span tags and
#      presentation attributes, until the comment / action-links section
#      begins;
#   2. reads forward to the post's timestamp, preferring the title of an
#      <abbr> timestamp element over the visible text;
#   3. stores the finished item in %items keyed by its epoch time and keeps
#      the newest date in $pubDate for <lastBuildDate>.
$p = HTML::TokeParser::Simple->new(\$file);

%items = ();
$pubDate = "1/1/1970";
$realTimeText = "1/1/1970";

TOKEN:
while (my $token = $p->get_token) {
    next TOKEN unless $token->is_start_tag;

    # when we hit the div class used by FB for all wall posts it's time to get busy
    next TOKEN unless $token->as_is =~ /UIIntentionalStory/;

    my $content = "";
    my $firstMediaItem = 1;
    my $realTimeFlag = 0;

    # get_token returns undef at the end of the document, so every advance
    # below is checked before a method is called on the token.
    $token = $p->get_token;
    while (defined $token && $token->as_is !~ /id="commentable_item_|UIActionLinks/) {
        # put a space in whenever we hit a closing div tag to avoid mashing text together
        if ($token->is_end_tag("div")) {
            $content .= " ";
        }
        # add a newline before "attachments"
        if ($token->is_start_tag("div") && ($token->as_is =~ /UIStoryAttachment/) && $firstMediaItem) {
            $firstMediaItem = 0;
            $content .= "<br />";
        }
        # add a newline before the content of the wall post
        if ($token->is_start_tag("span") && ($token->as_is =~ /UIStory_Message/)) {
            $content .= "<br />";
        }
        # skip over tags we want to ignore and strip out useless attributes
        if ( not($token->is_end_tag("h3")) &&
             not($token->is_tag("span")) &&
             not($token->is_tag("div")) ) {
            $token->delete_attr("onclick");
            $token->delete_attr("class");
            $token->delete_attr("id");
            $token->delete_attr("data-ft");
            # if the URL in a link is a relative link to a photo, make it absolute
            if ($token->is_start_tag("a")) {
                my $href = $token->get_attr("href");
                if ($href) {
                    # Anchored at the start so an href that is already
                    # absolute is not given a second host prefix.
                    $href =~ s/^\/photo\.php/http:\/\/www.facebook.com\/photo\.php/;
                    $token->set_attr("href", $href);
                }
            }
            # hacking a little spacing around the profile image
            # even though presentation *should* be completely separate from content
            if ($token->is_start_tag("img")) {
                my $src = $token->get_attr("src");
                if (defined $src && $src =~ /http:\/\/profile/) {
                    $token->set_attr("align", "left");
                    $token->set_attr("hspace", "10");
                    $token->set_attr("class", "picture");
                    $token->set_attr("style", "margin-right: 10px !important;");
                }
            }
            # Skip over "read more" tokens otherwise keep the token as-is
            if ($token->as_is !~ /^read more$/i) {
                $content .= $token->as_is;
            }
        } #end if ( not($token->is_end_tag("h3")) &&...
        $token = $p->get_token;
    }
    # The document ended inside a post: there is no timestamp left to read.
    last TOKEN unless defined $token;

    # Advance to the next text token, which holds the visible time text,
    # watching for a real timestamp on the way.
    $token = $p->get_token;
    last TOKEN unless defined $token;
    while (not $token->is_text) {
        $token = $p->get_token;
        last TOKEN unless defined $token;
        # check for a real timestamp
        if ($token->is_tag("abbr") && ($token->as_is =~ /timestamp/i)) {
            $realTimeFlag = 1;
            $realTimeText = $token->get_attr("title");
        }
    }

    # if we have a real timestamp, use it, otherwise parse the time text
    my $timeText;
    if ($realTimeFlag) {
        $timeText = $realTimeText;
    } else {
        $timeText = $token->as_is;
        if ($timeText =~ /about/) {
            $timeText =~ s/\sat/,/;
        }
    }
    $timeText = "" unless defined $timeText;

    my $date = ParseDate($timeText);
    # Text that cannot be parsed would leave the item without a key or a
    # pubDate.  Fall back to the current time as an approximation.
    # NOTE(review): assumes unparseable stamps are recent relative phrases - confirm.
    $date = ParseDate("now") unless $date;
    my $epochDate = UnixDate($date, "%s");
    # parsing the text sometimes means it will be a week off,
    # i.e. *next* Friday instead of *last* Friday
    # so if we get a future date, subtract a week
    if ($epochDate > time()) {
        $epochDate = $epochDate - 604800;
        $date = DateCalc($date, "-1 week");
    }
    my $epochPubDate = UnixDate($pubDate, "%s");
    # keep track of the newest post, this will be our lastBuildDate
    if ($epochDate > $epochPubDate) {
        $pubDate = $date;
    }
    my $dateText = UnixDate($date, "%a, %e %b %Y %H:%M:%S %Z");

    # Notes and albums link to the original item; everything else links to
    # the wall itself.
    my $link = $url;
    my $linkText = "View wall";
    my $title = "New wall post";
    if ($content =~ /(\/note\.php.+?)"/) {
        $link = "http://www.facebook.com".$1;
        $title = "New note";
        $linkText = "View original note";
    }
    if ($content =~ /(\/album\.php.+?)"/) {
        $link = "http://www.facebook.com".$1;
        $title = "New photos";
        $linkText = "View album";
    }

    $content .= "<small><br /><a href=\"$link\">$linkText</a><br /></small>";

    # A literal "]]>" inside the post would close the CDATA section early;
    # split it across two sections so the XML stays well formed.
    (my $cdata = $content) =~ s/\]\]>/]]]]><![CDATA[>/g;

    # use the epoch time as the key for a hash of hashes that we can sort later.
    # Posts sharing a timestamp would overwrite each other, so step back one
    # second at a time until the key is free.
    $epochDate-- while exists $items{$epochDate};

    # Numeric character references keep the values valid XML; named HTML
    # entities are not defined there.
    $items{$epochDate}{Link} = HTML::Entities::encode_entities_numeric($link);
    $items{$epochDate}{Text} = HTML::Entities::encode_entities_numeric($title);
    $items{$epochDate}{Description} = "<![CDATA[".$cdata."]]>";
    $items{$epochDate}{Date} = $dateText;
}

# Output the rest of the RSS.
# $pubDate holds the newest post date recorded while parsing; it is turned
# into the channel's lastBuildDate.
$pubDateText = UnixDate($pubDate, "%a, %e %b %Y %H:%M:%S %Z");
print "<lastBuildDate>$pubDateText</lastBuildDate>\n";

# Sort the posts newest first.  The keys are epoch seconds, so they must be
# compared as numbers: a string sort would rank "999999999" as newer than
# "1000000000".
@itemKeys = sort { $b <=> $a } keys %items;

# write out well formed items, now in the right order
foreach $epoch (@itemKeys) {
    print "<item>\n";
    print "   <title>", $items{$epoch}{Text}, "</title>\n";
    print "   <description>", $items{$epoch}{Description}, "</description>\n";
    print "   <pubDate>", $items{$epoch}{Date}, "</pubDate>\n";
    print "   <link>", $items{$epoch}{Link}, "</link>\n";
    print "</item>\n";
}

# End of the RSS feed
print "
</channel>
</rss>
";

exit (0);