#!/usr/bin/perl -w # ("perldoc spicylinks.pl" to read this) =head1 NAME spicylinks - make an aggregated RSS feed of the URLs that everyone's linking to =head1 SYNOPSIS spicylinks [--feeds file.opml] [--maxitems num] [--timelimit num] [--dump] =head1 DESCRIPTION A shameless copy of http://dev.upian.com/hotlinks/ , with less features and no web UI -- but with the source code, and with the ability for you to specify what feeds you want to aggregate. See C for more details. =head1 OPTIONS =over 4 =item --feeds file.opml The OPML file to load the source feed-list from. Note that only the RSS/Atom URLs for link-blogs should be listed in the file. This can be a remote http/https URL, and inclusion of items from remote sources is supported inside the OPML file for C nodes of C, C or C. =item --delnetwork delusername Load the source feed-list from a del.icio.us user's network. C is the person's username on del.icio.us, e.g. B. =item --maxitems num The maximum number of items to output in each output RSS feed. =item --timelimit num How "fresh" should an item be, for it to be included? Specified in hours. Items that don't declare a date in the source feed, are considered to have been posted right now. =item --dump Dump all (parsed) XML structures using Data::Dumper, for debugging. =back =head1 CACHING Yes, it does -- in the C<$HOME/.leechrss> directory. (This is an artifact of my other web-scraping scripts which also use that dir.) RSS files are cached for as long as C says they should be cached; see the documentation for the C C method. =head1 USAGE Here's the line I use in my "crontab" file to run the script every 30 minutes: 10,40 * * * * cd /home/jm/DIR/spicylinks \; perl spicylinks.pl --feeds feeds.opml > LOG 2>&1 Standard stuff. =head1 LICENSE GPL =head1 PREREQUISITES The following CPAN modules must be installed: WWW::Mechanize XML::Simple XML::RSS MIME::Base64 HTML::Entities Date::Parse Template optionally: Time::Duration -- to produce nice "3 hours ago" readable timestamps =head1 AUTHOR Justin Mason, C =head1 VERSION 1.4 Apr 17 2007 jm =cut use strict; use Getopt::Long; use utf8; use vars qw( $opt_dump $opt_maxitems $opt_timelimit $opt_feeds $opt_delnetwork ); GetOptions( "dump", "maxitems=s", "timelimit=s", "delnetwork=s", "feeds=s" ) or usage(); $opt_maxitems ||= 30; $opt_timelimit ||= (7 * 24); # 1 week $opt_feeds ||= "feeds.opml"; if ($opt_delnetwork) { $opt_feeds = "http://taint.org/scraped/deliciousnetwork2opml.cgi?u=".$opt_delnetwork; } # CPANish stuff use WWW::Mechanize; use XML::Simple; use XML::RSS; use MIME::Base64; use HTML::Entities; use Date::Parse; use Template; use constant HAS_TIME_DURATION => eval { require Time::Duration; }; # perl standards use Data::Dumper; use File::Path; use Pod::Usage; use Encode; # --------------------------------------------------------------------------- my %TEMPLATES_HTML; { $TEMPLATES_HTML{ALL} = q{

[% LINKS %] }; $TEMPLATES_HTML{LINK} = q{

[% LINK %] [[% COUNT %]]
First posted [% WHENRELATIVE %]

[% PEOPLE %]

(Tags: [% TAGS %])

}; $TEMPLATES_HTML{PERSON} = q{

[% WHO %] : [% TITLE %]

[% DESC %]

}; }; my %TEMPLATES_RSS; { $TEMPLATES_RSS{ALL} = ''; $TEMPLATES_RSS{LINK} = q{

[% PEOPLE %]

(Tags: [% TAGS %])

}; $TEMPLATES_RSS{PERSON} = q{

[% WHO %] : [% TITLE %] - [% DESC %]

}; }; my %TEMPLATES_TEXT; { $TEMPLATES_TEXT{ALL} = q{ [% LINKS %] }; $TEMPLATES_TEXT{LINK} = q{ [% COUNT %] -- [% LINK %] [% PEOPLE %] (Tags: [% TAGS %]) }; $TEMPLATES_TEXT{PERSON} = q{ [% WHO %] : [% TITLE %] [% DESC %] }; }; # --------------------------------------------------------------------------- my $mech = WWW::Mechanize->new(); my $cmech = mech_cache_new($mech); my $links = { }; my $tooold = 0; my @sorted_links; my $timelimit_dc_date = time_t_to_dcdate(time - ($opt_timelimit * 60 * 60)); main(); exit; sub main { if ($opt_feeds =~ /^http/i) { $opt_feeds = get_remote_opml($opt_feeds); # returns it as string die "failed to get OPML file $opt_feeds\n" unless $opt_feeds; } my $feeds = XMLin($opt_feeds); my @outlines = @{$feeds->{body}{outline}}; @outlines = resolve_opml_inclusions(0, @outlines); # uniq that list; earlier entries take priority over later my %seen = (); my @new = (); foreach my $feed (@outlines) { next if $seen{$feed->{xmlUrl}}; $seen{$feed->{xmlUrl}}++; push @new, $feed; } @outlines = @new; # do polls in alphanumeric order, so output is similarly sorted foreach my $feed (sort { ($a->{title}||'') cmp ($b->{title}||'') } @outlines) { do_feed ($feed); } # and rank them by number of links. @sorted_links = sort { @{$links->{$b}} <=> @{$links->{$a}} } keys %$links; print "found: ".scalar @sorted_links." links, $tooold too old, in ". (scalar @outlines)." feeds.\n"; foreach my $level (1 .. 5) { summarise(\%TEMPLATES_HTML, $level, "index-$level.html"); summarise(\%TEMPLATES_TEXT, $level, "index-$level.txt"); summarise_rss(\%TEMPLATES_RSS, $level, "index-$level.xml"); } } sub get_remote_opml { my ($link) = @_; my ($uri, $text) = get_with_cache ($cmech, $link); if (!$uri) { warn "ERROR: GET failed: ".$link.": ".last_get_status()."\n"; return; } return $text; } sub resolve_opml_inclusions { my ($reclevel, @outlines) = @_; die "too much recursion: $reclevel levels" if $reclevel > 5; # proposed types for OPML inclusion/transclusion are 'link', most common, see # http://www.pubsub.com/lists/community_list_opml.opml , and 'opml' or # 'include': # http://blogs.msdn.com/alexbarn/archive/2005/11/24/496769.aspx#496880 . my @newoutlines = (); foreach my $outline (@outlines) { if ($outline->{type} =~ /^(?:link|opml|include)$/i && $outline->{url} && $outline->{url} =~ /^http/i) { my $opmlstr = get_remote_opml($outline->{url}); if ($opmlstr) { my $feeds = XMLin($opmlstr); push @newoutlines, resolve_opml_inclusions(++$reclevel, @{$feeds->{body}{outline}}); } } else { push @newoutlines, $outline; } } return @newoutlines; } sub do_feed { my ($feed) = @_; my $link = $feed->{xmlUrl}; my ($uri, $text) = get_with_cache ($cmech, $link); if (!$uri) { warn "ERROR: GET failed: ".$link.": ".last_get_status()."\n"; return; } my $rss; eval { $rss = XMLin($text); if ($opt_dump) { print "debug: ".Dumper($rss); } if (defined $rss->{version} && $rss->{version} =~ /^0\.9/) { parse_rss091($feed, $rss); } elsif ($rss->{xmlns} && $rss->{xmlns} =~ /\/Atom\b/i) { parse_atom($feed, $rss); } elsif ($rss->{xmlns} && $rss->{xmlns} =~ /\/rss\/1\.0\//) { parse_rss10($feed, $rss); } elsif ($rss->{version} && $rss->{version} =~ /^2\.0$/) { parse_rss20($feed, $rss); } else { die "unknown! $feed->{title} -- $feed->{htmlUrl}\n"; } }; if ($@) { warn "ERROR: $@"; warn "ERROR: ".Dumper($rss); } } sub parse_rss091 { my ($feed, $rss) = @_; print "rss091: $feed->{title} -- $feed->{htmlUrl}\n"; foreach my $item (@{$rss->{item}}) { my $l = $item->{link}; next unless $l; add_link ($l, { title => $item->{title}, desc => $item->{description}, whopage => $feed->{htmlUrl}, who => $feed->{title} }); } } sub parse_rss10 { my ($feed, $rss) = @_; my $xmlns = parse_xmlns_attributes($rss); my $title = $feed->{title} || $feed->{htmlUrl}; print "rss10: $title -- $feed->{htmlUrl}\n"; foreach my $item (@{$rss->{item}}) { my $l = $item->{link}; next unless $l; # my $tags = tag_bag_extract($xmlns, $item); my $tags = tag_dc_subject_extract($xmlns, $item); # turn an empty hash {} into "" (del.icio.us now does this) if (!$item->{description} || scalar $item->{description} =~ /HASH/) { $item->{description} = ''; } add_link ($l, { title => $item->{title}, desc => $item->{description}, tags => $tags, when => $item->{'dc:date'}, whopage => $feed->{htmlUrl}, who => $title }); } } sub parse_rss20 { my ($feed, $rss) = @_; my $feed_pubDate = $rss->{channel}{pubDate}; if ($feed_pubDate) { $feed_pubDate = force_date_to_dcdate($feed_pubDate); } my $xmlns = parse_xmlns_attributes($rss); print "rss20: $feed->{title} -- $feed->{htmlUrl}\n"; foreach my $item (@{$rss->{channel}{item}}) { my $l = $item->{link}; next unless $l; my $tags = tag_dc_subject_extract($xmlns, $item); my $dcdate; if ($item->{'dc:date'}) { $dcdate = $item->{'dc:date'}; } elsif ($item->{pubDate}) { $dcdate = force_date_to_dcdate($item->{pubDate}); } elsif ($feed_pubDate) { # hack. http://bryanBellsLinks.weblogger.com/ needs this $dcdate = $feed_pubDate; } # this happens if the desc is empty. if (ref $item->{description} eq 'HASH') { $item->{description} = ''; } add_link ($l, { title => $item->{title}, desc => $item->{description}, tags => $tags, when => $dcdate, whopage => $feed->{htmlUrl}, who => $feed->{title} }); } } sub parse_atom { my ($feed, $rss) = @_; my $xmlns = parse_xmlns_attributes($rss); print "atom: $feed->{title} -- $feed->{htmlUrl}\n"; foreach my $k (keys %{$rss->{entry}}) { my $item = $rss->{entry}{$k}; my $l; if (ref $item->{link} eq 'ARRAY') { my %linkbyrel = (); foreach my $link (@{$item->{link}}) { $linkbyrel{$link->{rel}} = $link->{href}; } $l = $linkbyrel{related}; if (!$l) { $l = $linkbyrel{alternate}; } } else { $l = $item->{link}->{href}; } if (!$l) { warn "WARNING: atom: failed to find link, skipping: ".Dumper($item); next; } my $title = parse_atom_text_node ($item->{title}); my $summary = parse_atom_text_node ($item->{summary}); my $tags = tag_dc_subject_extract($xmlns, $item); # wtf Atom? 'published' is seen in blogmarks.net Atom feeds my $when = $item->{'dc:date'}; if ($item->{created}) { $when = $item->{created}; } elsif ($item->{modified}) { $when = $item->{modified}; } elsif ($item->{published}) { $when = $item->{published}; } # print Dumper($item->{summary}); # warn "atom: [$feed->{title}] [$l] [$title] [$summary]\n"; add_link ($l, { title => $title, desc => $summary, tags => $tags, when => $when, whopage => $feed->{htmlUrl}, who => $feed->{title} }); } } sub add_link { my ($l, $linkitem) = @_; $links->{$l} ||= [ ]; my $when = $linkitem->{when}; if (!$when) { warn "oops: no timestamp for item: $l who=$linkitem->{whopage}"; } else { # convert to time_t, and back again, so that they're all in the # GMT timezone. my $tt = str2time($when); if (!defined $tt) { warn "unparseable (by Date::Parse) date string: '$when'"; $tt = time; # whatever! } $when = time_t_to_dcdate($tt); $linkitem->{when_time_t} = $tt; if ($when lt $timelimit_dc_date) { $tooold++; return; # too old for our tastes } } # whitespace -- not wanted. sanitise the strings foreach my $tag (qw( whopage title who tags )) { next unless (defined $linkitem->{$tag}); $linkitem->{$tag} =~ s/^\s+//; $linkitem->{$tag} =~ s/\s+$//; } push (@{$links->{$l}}, $linkitem); } # --------------------------------------------------------------------------- sub parse_atom_text_node { my ($item) = @_; if (ref $item eq 'HASH') { my $t = $item->{content}; if ($item->{type} && $item->{type} =~ /text/i) { # invalid feeds use "TEXT" instead of "text/plain","text/html" et al if ($item->{mode} && $item->{mode} eq 'base64') { if (!eval { $t = decode_base64($t); 1; }) { warn "'base64' decode failed! $@ ".Dumper($item); $t = $item->{content}; } } if ($item->{mode} && $item->{mode} eq 'escaped') { if (!eval { $t = decode_entities($t); 1; }) { warn "'escaped' decode failed! $@ ".Dumper($item); $t = $item->{content}; } } else { # no-op } return $t; } else { warn "wtf! non-text item?! ".Dumper($item); return $item->{content}; } } else { return $item; } } sub parse_xmlns_attributes { my ($rss) = @_; my $xmlns = { 'taxo' => '', 'rdf' => '', }; foreach my $attr (keys %{$rss}) { next unless $attr =~ /^xmlns:(.*)$/; my $ns = $1; if ($rss->{$attr} eq 'http://purl.org/rss/1.0/modules/taxonomy/') { $xmlns->{taxo} = $ns; } elsif ($rss->{$attr} eq 'http://www.w3.org/1999/02/22-rdf-syntax-ns#') { $xmlns->{rdf} = $ns; } } return $xmlns; } sub tag_bag_extract { my $xmlns = shift; my $item = shift; my @topic_strs = (); my $topics = $item->{$xmlns->{taxo}.":topics"} ->{$xmlns->{rdf}.":Bag"} ->{$xmlns->{rdf}.":li"}; if ($topics && ref $topics eq 'ARRAY') { foreach my $li (@{$topics}) { my $t = $li->{resource}; next unless $t; $t =~ s,^http://del.icio.us/tag/,,; $t =~ s,^http://delicious.com/tag/,,; push (@topic_strs, $t); } } return join(' ', @topic_strs); } sub tag_dc_subject_extract { my $xmlns = shift; my $item = shift; my $tags = $item->{'dc:subject'}; if (!ref $tags) { return $tags; # a string } elsif (ref $tags eq 'HASH') { return keys %{$tags}; } elsif (ref $tags eq 'ARRAY') { return join(' ', @{$tags}); } else { warn "wtf! odd dc:subject?! ".Dumper($item); return ''; } } # --------------------------------------------------------------------------- sub force_date_to_dcdate { # 'pubDate' => 'Fri, 07 Oct 2005 15:45:01 +0100' # 'dc:date' => '2005-10-05T03:47:56Z' my ($pub) = @_; my $tt = str2time($pub); # parses all sorts of funkiness return time_t_to_dcdate($tt); } sub time_t_to_dcdate { # 'dc:date' => '2005-10-05T03:47:56Z' my ($tt) = @_; if (!defined $tt) { return; } my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday) = gmtime($tt); return sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ", $year+1900, $mon+1, $mday, $hour, $min, $sec); } my %ago_cache; sub my_ago { my $when = shift; if (!$when) { return $when; } if (!$ago_cache{$when}) { my $ago; if (HAS_TIME_DURATION) { my $tt = str2time($when); $tt ||= time; $ago = eval { Time::Duration::ago(time - $tt); }; # warn $@ if $@; } if (!defined $ago) { $ago = $when; # no luck? fall back to the ugly version } $ago_cache{$when} = $ago; } return $ago_cache{$when}; } # --------------------------------------------------------------------------- sub summarise { my $templates = shift; my $level = shift; my $outputfile = shift; my $tmpl = Template->new; my $alllinksout = ''; my $numitems = 0; foreach my $item (@sorted_links) { my $count = scalar @{$links->{$item}}; next unless ($count >= $level); my $linkout = ''; my $alltags = ''; my $earliest = ''; foreach my $ent (@{$links->{$item}}) { my $tags = ($ent->{tags} || ''); $alltags .= " ".$tags; my $when = $ent->{when}; if ($when && (!$earliest || $when lt $earliest)) { $earliest = $when; } $tmpl->process(\$templates->{PERSON}, { WHO => $ent->{who}, WHOPAGE => $ent->{whopage}, TITLE => ($ent->{title} || ''), DESC => ($ent->{desc} || ''), WHEN => $when, WHENRELATIVE => my_ago($when), TAGS => $tags, }, \$linkout) or die $tmpl->error(); } # uniq the tag list my %u=(); my $uniqtags = join ' ', sort grep {defined} map { if (exists $u{$_}) { undef; } else { $u{$_}=undef;$_; } } split ' ', $alltags; undef %u; $tmpl->process(\$templates->{LINK}, { COUNT => $count, LINK => $item, PEOPLE => $linkout, WHEN => $earliest, WHENRELATIVE => my_ago($earliest), TAGS => $uniqtags, }, \$alllinksout) or die $tmpl->error(); last if $numitems++ > $opt_maxitems; } $tmpl->process(\$templates->{ALL}, { LINKS => $alllinksout }, $outputfile) or die $tmpl->error(); print "writing: $outputfile\n"; } sub summarise_rss { my $templates = shift; my $level = shift; my $outputfile = shift; my $tmpl = Template->new; my $rss = new XML::RSS(version => '1.0'); $rss->channel ( title => "Spicylinks - Level $level", link => "http://taint.org/spicylinks/", description => "Spicylinks - Level $level", dc => { language => 'en-us', publisher => 'jm-spicylinks@jmason.org' } ); foreach my $item (@sorted_links) { my $count = scalar @{$links->{$item}}; next unless ($count >= $level); my $itemout = ''; my $linkout = ''; my $alltags = ''; my $earliest = ''; my $latest = ''; my $toptitle = ''; foreach my $ent (@{$links->{$item}}) { my $tags = ($ent->{tags} || ''); $alltags .= " ".$tags; my $when = $ent->{when}; if ($when) { if (!$earliest || $when lt $earliest) { $earliest = $when; } if (!$latest || $when gt $latest) { $latest = $when; } } if (!$toptitle) { $toptitle = $ent->{title}; } $tmpl->process(\$templates->{PERSON}, { WHO => $ent->{who}, WHOPAGE => $ent->{whopage}, TITLE => ($ent->{title} || ''), DESC => ($ent->{desc} || ''), WHEN => $when, WHENRELATIVE => my_ago($when), TAGS => $tags, }, \$linkout) or die $tmpl->error(); } # uniq the tag list my %u=(); my $uniqtags = join ' ', sort grep {defined} map { if (exists $u{$_}) { undef; } else { $u{$_}=undef;$_; } } split ' ', $alltags; undef %u; my $when = $latest; $tmpl->process(\$templates->{LINK}, { COUNT => $count, LINK => $item, PEOPLE => $linkout, WHEN => $when, WHENRELATIVE => my_ago($when), TAGS => $uniqtags, }, \$itemout) or die $tmpl->error(); $rss->add_item( title => ($toptitle || '(Untitled)'), link => $item, description => $itemout, dc => { date => $when, subject => $uniqtags }, ); } open (OUT, ">$outputfile") or die "failed to write to $outputfile"; eval { binmode OUT, ":encoding(utf8)"; }; print OUT $rss->as_string; close OUT or die "failed to write to $outputfile"; print "writing: $outputfile\n"; } # --------------------------------------------------------------------------- sub mech_cache_new { my ($mech) = @_; my $cache = $ENV{HOME}."/.leechrss/cache"; (-d $cache) or mkpath ($cache); my $self = { mech => $mech, cache => $cache }; $self; } sub get_with_cache { my ($self, $link) = @_; my ($status, $uri, $content) = cache_get ($self, $link); if ($uri) { warn "cached: $link\n"; $self->{last_status} = $status; return ($uri, $content, 0); } else { warn "get: $link\n"; eval { my $resp = $self->{mech}->get ($link); $uri = $self->{mech}->uri; $content = Encode::decode_utf8 $self->{mech}->content; $self->{last_status} = $self->{mech}->status; my $expires = int $resp->fresh_until(); cache_put ($self, $link, $uri, $content, $expires, $self->{last_status}); 1; } or warn $@; if ($self->{last_status} =~ /^2/) { return ($uri, $content, 1); } } return; } sub last_get_status { my ($self) = @_; return $self->{last_status}; } sub cache_put { my ($self, $link, $uri, $content, $expires, $status) = @_; my $cfile = $link; $cfile =~ s/[^A-Za-z0-9_]/_/gs; $cfile = "$self->{cache}/$cfile"; open (OUT, ">$cfile") or die "cannot write $cfile"; eval { binmode OUT, ":encoding(utf8)"; }; print OUT "Status: $status\n"; print OUT "Expires: $expires\n"; if (defined $uri) { print OUT "URI: $uri\n"; } print OUT "\n"; print OUT $content; close OUT; } sub cache_get { my ($self, $link) = @_; my $cfile = $link; $cfile =~ s/[^A-Za-z0-9_]/_/gs; $cfile = "$self->{cache}/$cfile"; open (IN, "<$cfile") or return (); my $uri; my $status; my $expires; while () { s/\r?\n$//s; /^URI: (.*)$/ and $uri = $1; /^Status: (.*)$/ and $status = $1; /^Expires: (.*)$/ and $expires = $1; /^$/ and last; } my $content = join ('', ); close IN; if (time > $expires) { # the cached copy has expired. return nothing return (); } return ($status, $uri, $content); } # --------------------------------------------------------------------------- sub usage { pod2usage( -verbose => 0, -exitval => 1 ); } 1;