#!/usr/bin/perl -w
# ("perldoc spicylinks.pl" to read this)
=head1 NAME

spicylinks - make an aggregated RSS feed of the URLs that everyone's linking to

=head1 SYNOPSIS

spicylinks [--feeds file.opml] [--maxitems num] [--timelimit num] [--dump]

=head1 DESCRIPTION

A shameless copy of http://dev.upian.com/hotlinks/ , with less features and no
web UI -- but with the source code, and with the ability for you to specify
what feeds you want to aggregate.

See C<http://taint.org/wk/SpicyLinks> for more details.

=head1 OPTIONS

=over 4

=item --feeds file.opml

The OPML file to load the source feed-list from.   Note that only the RSS/Atom
URLs for link-blogs should be listed in the file.  This can be a remote
http/https URL, and inclusion of items from remote sources is supported
inside the OPML file for C<outline> nodes of C<type='link'>, C<type='opml'>
or C<type='include'>.

=item --delnetwork delusername

Load the source feed-list from a del.icio.us user's network.  C<delusername>
is the person's username on del.icio.us, e.g. B<jm>.

=item --maxitems num

The maximum number of items to output in each output RSS feed.

=item --timelimit num

How "fresh" should an item be, for it to be included?  Specified in hours.
Items that don't declare a date in the source feed, are considered to have been
posted right now.

=item --dump

Dump all (parsed) XML structures using Data::Dumper, for debugging.

=back

=head1 CACHING

Yes, it does -- in the C<$HOME/.leechrss> directory.  (This is an artifact of
my other web-scraping scripts which also use that dir.)

RSS files are cached for as long as C<LWP> says they should be cached;
see the documentation for the C<HTTP::Response> C<freshness_lifetime>
method.

=head1 USAGE

Here's the line I use in my "crontab" file to run the script every
30 minutes:

  10,40 * * * * cd /home/jm/DIR/spicylinks \; perl spicylinks.pl --feeds feeds.opml > LOG 2>&1

Standard stuff.

=head1 LICENSE

GPL

=head1 PREREQUISITES

The following CPAN modules must be installed:

  WWW::Mechanize XML::Simple XML::RSS MIME::Base64 HTML::Entities Date::Parse
  Template 
  
optionally:

  Time::Duration -- to produce nice "3 hours ago" readable timestamps

=head1 AUTHOR

Justin Mason, C<jm at jmason dot org>

=head1 VERSION

1.4 Apr 17 2007 jm 

=cut

use strict;
use Getopt::Long;
use utf8;

use vars qw(
    $opt_dump $opt_maxitems $opt_timelimit $opt_feeds $opt_delnetwork
);
GetOptions(
    "dump",
    "maxitems=s",
    "timelimit=s",
    "delnetwork=s",
    "feeds=s"
) or usage();

$opt_maxitems   ||= 30;
$opt_timelimit  ||= (7 * 24);      # 1 week
$opt_feeds      ||= "feeds.opml";

if ($opt_delnetwork) {
  $opt_feeds = "http://taint.org/scraped/deliciousnetwork2opml.cgi?u=".$opt_delnetwork;
}

# CPANish stuff
use WWW::Mechanize;
use XML::Simple;
use XML::RSS;
use MIME::Base64;
use HTML::Entities;
use Date::Parse;
use Template;
use constant HAS_TIME_DURATION => eval { require Time::Duration; };

# perl standards
use Data::Dumper;
use File::Path;
use Pod::Usage;
use Encode;

# ---------------------------------------------------------------------------

my %TEMPLATES_HTML;
{
  $TEMPLATES_HTML{ALL} = q{
    <html><head>

      <style type="text/css" media="all">

        p.sl_person_desc {
          color: #666;
          font-size: 50%;
          text-align: right;
        }
        span.sl_title: {
          font-weight: bold;
          border: 3px solid #aaa;
          background: #eec;
        }
        li.sl_person_li {
          border: 1px solid #aaa;
          margin: 2px;
          padding: 0px 5px 0px 5px;
          background: #eec;
        }
        p.sl_taglist {
          color: #999;
          font-size: 50%;
          text-align: right;
        }
        li.sl_links_li {
          margin: 20px 5px 20px 5px;
          padding: 0px 10px 0px 10px;
        }
        a.sl_href {
          font-size: 75%;
          font-weight: bold;
          line-height: 2em;
          color: #d00;
          background: #ddd;
          border: 1px dashed;
          margin: 20px 5px 20px 5px;
          padding: 5px 5px 5px 5px;
        }
        span.sl_count {
          color: #999;
          font-size: 50%;
        }
        span.sl_firstpost {
          color: #999;
          font-size: 50%;
        }

      </style>
    </head><body>

    <ul class=sl_links_ul>
    [% LINKS %]
    </ul>
    </body>
    </html>
  };

  $TEMPLATES_HTML{LINK} = q{
    <li class=sl_links_li>
      <a href="[% LINK %]" class=sl_href>[% LINK %]</a>
        <span class=sl_count>[[% COUNT %]]</span><br>
        <span class=sl_firstpost>First posted [% WHENRELATIVE %]</span>
      <ul class=sl_people>
        [% PEOPLE %]
      </ul>
      <p class=sl_taglist>(Tags: [% TAGS %])</p>
    </li>
  };

  $TEMPLATES_HTML{PERSON} = q{
    <li class=sl_person_li>
      <p class=sl_person>
          <a href="[% WHOPAGE %]" class=sl_whopage_href>[% WHO %]</a> :
              <span class=sl_title>[% TITLE %]</span></p>
      <p class=sl_person_desc>[% DESC %]</p>
    </li>
  };
};

my %TEMPLATES_RSS;

{
  $TEMPLATES_RSS{ALL} = '';

  $TEMPLATES_RSS{LINK} = q{
    <ul>
      [% PEOPLE %]
    </ul>
    <p>(Tags: [% TAGS %])</p>
  };

  $TEMPLATES_RSS{PERSON} = q{
    <li>
      <p>
          <a href="[% WHOPAGE %]">[% WHO %]</a> :
              <span class=sl_title>[% TITLE %]</span> -
              <em><span class=sl_person_desc>[% DESC %]</span></em>
      </p>
    </li>
  };
};

my %TEMPLATES_TEXT;

{
  $TEMPLATES_TEXT{ALL} = q{ [% LINKS %] };

  $TEMPLATES_TEXT{LINK} = q{
  [% COUNT %] -- [% LINK %]
  [% PEOPLE %]
    (Tags: [% TAGS %])
  };

  $TEMPLATES_TEXT{PERSON} = q{ 
    [% WHO %] : [% TITLE %]
    [% DESC %]
  };
};

# ---------------------------------------------------------------------------

my $mech = WWW::Mechanize->new();
my $cmech = mech_cache_new($mech);

my $links = { };
my $tooold = 0;
my @sorted_links;
my $timelimit_dc_date = time_t_to_dcdate(time - ($opt_timelimit * 60 * 60));

main();
exit;

sub main {
  if ($opt_feeds =~ /^http/i) {
    $opt_feeds = get_remote_opml($opt_feeds);   # returns it as string
    die "failed to get OPML file $opt_feeds\n" unless $opt_feeds;
  }

  my $feeds = XMLin($opt_feeds);
  my @outlines = @{$feeds->{body}{outline}};
  @outlines = resolve_opml_inclusions(0, @outlines);

  # uniq that list; earlier entries take priority over later
  my %seen = ();
  my @new = ();
  foreach my $feed (@outlines) {
    next if $seen{$feed->{xmlUrl}};
    $seen{$feed->{xmlUrl}}++;
    push @new, $feed;
  }
  @outlines = @new;

  # do polls in alphanumeric order, so output is similarly sorted
  foreach my $feed (sort {
              ($a->{title}||'') cmp ($b->{title}||'')
          } @outlines)
  {
    do_feed ($feed);
  }

  # and rank them by number of links.
  @sorted_links = sort {
            @{$links->{$b}} <=> @{$links->{$a}}
          } keys %$links;

  print "found: ".scalar @sorted_links." links, $tooold too old, in ".
        (scalar @outlines)." feeds.\n";

  foreach my $level (1 .. 5) {
    summarise(\%TEMPLATES_HTML, $level, "index-$level.html");
    summarise(\%TEMPLATES_TEXT, $level, "index-$level.txt");
    summarise_rss(\%TEMPLATES_RSS,  $level, "index-$level.xml");
  }
}

sub get_remote_opml {
  my ($link) = @_;

  my ($uri, $text) = get_with_cache ($cmech, $link);
  if (!$uri) {
    warn "ERROR: GET failed: ".$link.": ".last_get_status()."\n";
    return;
  }

  return $text;
}

sub resolve_opml_inclusions {
  my ($reclevel, @outlines) = @_;

  die "too much recursion: $reclevel levels" if $reclevel > 5;

  # proposed types for OPML inclusion/transclusion are 'link', most common, see
  # http://www.pubsub.com/lists/community_list_opml.opml , and 'opml' or
  # 'include':
  # http://blogs.msdn.com/alexbarn/archive/2005/11/24/496769.aspx#496880 .

  my @newoutlines = ();
  foreach my $outline (@outlines)
  {
    if ($outline->{type} =~ /^(?:link|opml|include)$/i
        && $outline->{url}
        && $outline->{url} =~ /^http/i)
    {
      my $opmlstr = get_remote_opml($outline->{url});
      if ($opmlstr) {
        my $feeds = XMLin($opmlstr);
        push @newoutlines, resolve_opml_inclusions(++$reclevel,
                @{$feeds->{body}{outline}});
      }

    } else {
      push @newoutlines, $outline;
    }
  }

  return @newoutlines;
}

sub do_feed {
  my ($feed) = @_;

  my $link = $feed->{xmlUrl};
  my ($uri, $text) = get_with_cache ($cmech, $link);
  if (!$uri) {
    warn "ERROR: GET failed: ".$link.": ".last_get_status()."\n";
    return;
  }

  my $rss;
  eval {
    $rss = XMLin($text);

    if ($opt_dump) {
      print "debug: ".Dumper($rss);
    }

    if (defined $rss->{version} && $rss->{version} =~ /^0\.9/) {
      parse_rss091($feed, $rss);
    }
    elsif ($rss->{xmlns} && $rss->{xmlns} =~ /\/Atom\b/i) {
      parse_atom($feed, $rss);
    }
    elsif ($rss->{xmlns} && $rss->{xmlns} =~ /\/rss\/1\.0\//) {
      parse_rss10($feed, $rss);
    }
    elsif ($rss->{version} && $rss->{version} =~ /^2\.0$/) {
      parse_rss20($feed, $rss);
    }
    else {
      die "unknown! $feed->{title} -- $feed->{htmlUrl}\n";
    }
  };

  if ($@) {
    warn "ERROR: $@";
    warn "ERROR: ".Dumper($rss);
  }
}

sub parse_rss091 {
  my ($feed, $rss) = @_;

  print "rss091: $feed->{title} -- $feed->{htmlUrl}\n";
  foreach my $item (@{$rss->{item}}) {
    my $l = $item->{link};
    next unless $l;

    add_link ($l, {
        title => $item->{title},
        desc => $item->{description},
        whopage => $feed->{htmlUrl},
        who => $feed->{title}
      });
  }
}

sub parse_rss10 {
  my ($feed, $rss) = @_;

  my $xmlns = parse_xmlns_attributes($rss);

  my $title = $feed->{title} || $feed->{htmlUrl};
  print "rss10: $title -- $feed->{htmlUrl}\n";
  foreach my $item (@{$rss->{item}}) {
    my $l = $item->{link};
    next unless $l;

    # my $tags = tag_bag_extract($xmlns, $item);
    my $tags = tag_dc_subject_extract($xmlns, $item);

    # turn an empty hash {} into "" (del.icio.us now does this)
    if (!$item->{description} || scalar $item->{description} =~ /HASH/) {
      $item->{description} = '';
    }

    add_link ($l, {
        title => $item->{title},
        desc => $item->{description},
        tags => $tags,
        when => $item->{'dc:date'},
        whopage => $feed->{htmlUrl},
        who => $title
      });
  }
}

sub parse_rss20 {
  my ($feed, $rss) = @_;

  my $feed_pubDate = $rss->{channel}{pubDate};
  if ($feed_pubDate) {
    $feed_pubDate = force_date_to_dcdate($feed_pubDate);
  }

  my $xmlns = parse_xmlns_attributes($rss);
  print "rss20: $feed->{title} -- $feed->{htmlUrl}\n";
  foreach my $item (@{$rss->{channel}{item}}) {
    my $l = $item->{link};
    next unless $l;
    my $tags = tag_dc_subject_extract($xmlns, $item);

    my $dcdate;
    if ($item->{'dc:date'}) {
      $dcdate = $item->{'dc:date'};
    }
    elsif ($item->{pubDate}) {
      $dcdate = force_date_to_dcdate($item->{pubDate});
    }
    elsif ($feed_pubDate) {
      # hack. http://bryanBellsLinks.weblogger.com/ needs this
      $dcdate = $feed_pubDate;
    }

    # this happens if the desc is empty.
    if (ref $item->{description} eq 'HASH') {
      $item->{description} = '';
    }

    add_link ($l, {
        title => $item->{title},
        desc => $item->{description},
        tags => $tags,
        when => $dcdate,
        whopage => $feed->{htmlUrl},
        who => $feed->{title}
      });

  }
}

sub parse_atom {
  my ($feed, $rss) = @_;

  my $xmlns = parse_xmlns_attributes($rss);
  print "atom: $feed->{title} -- $feed->{htmlUrl}\n";
  foreach my $k (keys %{$rss->{entry}}) {
    my $item = $rss->{entry}{$k};
    my $l;

    if (ref $item->{link} eq 'ARRAY') {
      my %linkbyrel = ();
      foreach my $link (@{$item->{link}}) {
        $linkbyrel{$link->{rel}} = $link->{href};
      }
      $l = $linkbyrel{related};
      if (!$l) { $l = $linkbyrel{alternate}; }
    }
    else {
      $l = $item->{link}->{href};
    }

    if (!$l) {
      warn "WARNING: atom: failed to find link, skipping: ".Dumper($item);
      next;
    }

    my $title = parse_atom_text_node ($item->{title});
    my $summary = parse_atom_text_node ($item->{summary});
    my $tags = tag_dc_subject_extract($xmlns, $item);

    # wtf Atom?  'published' is seen in blogmarks.net Atom feeds
    my $when = $item->{'dc:date'};
    if ($item->{created}) { $when = $item->{created}; }
    elsif ($item->{modified}) { $when = $item->{modified}; }
    elsif ($item->{published}) { $when = $item->{published}; }

    # print Dumper($item->{summary});
    # warn "atom: [$feed->{title}] [$l] [$title] [$summary]\n";

    add_link ($l, {
        title => $title,
        desc => $summary,
        tags => $tags,
        when => $when,
        whopage => $feed->{htmlUrl},
        who => $feed->{title}
      });
  }
}

sub add_link {
  my ($l, $linkitem) = @_;

  $links->{$l} ||= [ ];

  my $when = $linkitem->{when};
  if (!$when) {
    warn "oops: no timestamp for item: $l who=$linkitem->{whopage}";
  } 
  else {
    # convert to time_t, and back again, so that they're all in the
    # GMT timezone.

    my $tt = str2time($when);
    if (!defined $tt) {
      warn "unparseable (by Date::Parse) date string: '$when'";
      $tt = time;           # whatever!
    }

    $when = time_t_to_dcdate($tt);
    $linkitem->{when_time_t} = $tt;

    if ($when lt $timelimit_dc_date) {
      $tooold++;
      return;         # too old for our tastes
    }
  }

  # whitespace -- not wanted.  sanitise the strings
  foreach my $tag (qw(
                whopage title who tags
            ))
  {
    next unless (defined $linkitem->{$tag});
    $linkitem->{$tag} =~ s/^\s+//;
    $linkitem->{$tag} =~ s/\s+$//;
  }

  push (@{$links->{$l}}, $linkitem);
}

# ---------------------------------------------------------------------------

sub parse_atom_text_node {
  my ($item) = @_;
  if (ref $item eq 'HASH') {
    my $t = $item->{content};

    if ($item->{type} && $item->{type} =~ /text/i) {
      # invalid feeds use "TEXT" instead of "text/plain","text/html" et al
      if ($item->{mode} && $item->{mode} eq 'base64') {
        if (!eval { $t = decode_base64($t); 1; })
        {
          warn "'base64' decode failed! $@ ".Dumper($item);
          $t = $item->{content};
        }
      }
      if ($item->{mode} && $item->{mode} eq 'escaped') {
        if (!eval { $t = decode_entities($t); 1; })
        {
          warn "'escaped' decode failed! $@ ".Dumper($item);
          $t = $item->{content};
        }
      }
      else {
        # no-op
      }
      return $t;
    }
    else {
      warn "wtf! non-text item?! ".Dumper($item);
      return $item->{content};
    }
  }
  else {
    return $item;
  }
}

sub parse_xmlns_attributes {
  my ($rss) = @_;

  my $xmlns = { 
    'taxo' => '',
    'rdf' => '',
  };

  foreach my $attr (keys %{$rss}) {
    next unless $attr =~ /^xmlns:(.*)$/;
    my $ns = $1;
    if ($rss->{$attr} eq 'http://purl.org/rss/1.0/modules/taxonomy/') {
      $xmlns->{taxo} = $ns;
    }
    elsif ($rss->{$attr} eq 'http://www.w3.org/1999/02/22-rdf-syntax-ns#') {
      $xmlns->{rdf} = $ns;
    }
  }

  return $xmlns;
}

sub tag_bag_extract {
  my $xmlns = shift;
  my $item = shift;

  my @topic_strs = ();
  my $topics = $item->{$xmlns->{taxo}.":topics"}
                    ->{$xmlns->{rdf}.":Bag"}
                    ->{$xmlns->{rdf}.":li"};

  if ($topics && ref $topics eq 'ARRAY') {
    foreach my $li (@{$topics}) {
      my $t = $li->{resource};
      next unless $t;
      $t =~ s,^http://del.icio.us/tag/,,;
      $t =~ s,^http://delicious.com/tag/,,;
      push (@topic_strs, $t);
    }
  }
  return join(' ', @topic_strs);
}

sub tag_dc_subject_extract {
  my $xmlns = shift;
  my $item = shift;

  my $tags = $item->{'dc:subject'};
  if (!ref $tags) {
    return $tags;       # a string
  }
  elsif (ref $tags eq 'HASH') {
    return keys %{$tags};
  }
  elsif (ref $tags eq 'ARRAY') {
    return join(' ', @{$tags});
  }
  else {
    warn "wtf! odd dc:subject?! ".Dumper($item);
    return '';
  }
}

# ---------------------------------------------------------------------------

sub force_date_to_dcdate {
  # 'pubDate' => 'Fri, 07 Oct 2005 15:45:01 +0100'
  # 'dc:date' => '2005-10-05T03:47:56Z'
  my ($pub) = @_;

  my $tt = str2time($pub);  # parses all sorts of funkiness
  return time_t_to_dcdate($tt);
}

sub time_t_to_dcdate {
  # 'dc:date' => '2005-10-05T03:47:56Z'
  my ($tt) = @_;
  if (!defined $tt) { return; }
  my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday) = gmtime($tt);
  return sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year+1900, $mon+1, $mday, $hour, $min, $sec);
}

my %ago_cache;
sub my_ago {
  my $when = shift;

  if (!$when) { return $when; }

  if (!$ago_cache{$when}) {
    my $ago;

    if (HAS_TIME_DURATION) {
      my $tt = str2time($when);
      $tt ||= time;
      $ago = eval { Time::Duration::ago(time - $tt); };
      # warn $@ if $@;
    }

    if (!defined $ago) {
      $ago = $when;       # no luck? fall back to the ugly version
    }

    $ago_cache{$when} = $ago;
  }

  return $ago_cache{$when};
}

# ---------------------------------------------------------------------------

sub summarise {
  my $templates = shift;
  my $level = shift;
  my $outputfile = shift;

  my $tmpl = Template->new;
  my $alllinksout = '';
  my $numitems = 0;

  foreach my $item (@sorted_links) {
    my $count = scalar @{$links->{$item}};
    next unless ($count >= $level);

    my $linkout = '';
    my $alltags = '';
    my $earliest = '';
    foreach my $ent (@{$links->{$item}}) {
      my $tags = ($ent->{tags} || '');
      $alltags .= " ".$tags;

      my $when = $ent->{when};
      if ($when && (!$earliest || $when lt $earliest)) {
        $earliest = $when;
      }

      $tmpl->process(\$templates->{PERSON}, {
          WHO => $ent->{who},
          WHOPAGE => $ent->{whopage},
          TITLE => ($ent->{title} || ''),
          DESC => ($ent->{desc} || ''),
          WHEN => $when,
          WHENRELATIVE => my_ago($when),
          TAGS => $tags,
        }, \$linkout) or die $tmpl->error();
    }

    # uniq the tag list
    my %u=();
    my $uniqtags = join ' ', sort grep {defined} map {
          if (exists $u{$_}) { undef; } else { $u{$_}=undef;$_; }
        } split ' ', $alltags;
    undef %u;

    $tmpl->process(\$templates->{LINK}, {
        COUNT => $count,
        LINK => $item,
        PEOPLE => $linkout,
        WHEN => $earliest,
        WHENRELATIVE => my_ago($earliest),
        TAGS => $uniqtags,
      }, \$alllinksout) or die $tmpl->error();

    last if $numitems++ > $opt_maxitems;
  }

  $tmpl->process(\$templates->{ALL}, {
      LINKS => $alllinksout
    }, $outputfile) or die $tmpl->error();

  print "writing: $outputfile\n";
}

sub summarise_rss {
  my $templates = shift;
  my $level = shift;
  my $outputfile = shift;

  my $tmpl = Template->new;

  my $rss = new XML::RSS(version => '1.0');

  $rss->channel (
    title => "Spicylinks - Level $level",
    link => "http://taint.org/spicylinks/",
    description => "Spicylinks - Level $level",
    dc => {
      language   => 'en-us',
      publisher  => 'jm-spicylinks@jmason.org'
    }
  );

  foreach my $item (@sorted_links) {
    my $count = scalar @{$links->{$item}};
    next unless ($count >= $level);

    my $itemout = '';
    my $linkout = '';
    my $alltags = '';
    my $earliest = '';
    my $latest = '';
    my $toptitle = '';
    foreach my $ent (@{$links->{$item}}) {
      my $tags = ($ent->{tags} || '');
      $alltags .= " ".$tags;

      my $when = $ent->{when};
      if ($when) {
        if (!$earliest || $when lt $earliest) {
          $earliest = $when;
        }
        if (!$latest || $when gt $latest) {
          $latest = $when;
        }
      }

      if (!$toptitle) {
        $toptitle = $ent->{title};
      }

      $tmpl->process(\$templates->{PERSON}, {
          WHO => $ent->{who},
          WHOPAGE => $ent->{whopage},
          TITLE => ($ent->{title} || ''),
          DESC => ($ent->{desc} || ''),
          WHEN => $when,
          WHENRELATIVE => my_ago($when),
          TAGS => $tags,
        }, \$linkout) or die $tmpl->error();
    }

    # uniq the tag list
    my %u=();
    my $uniqtags = join ' ', sort grep {defined} map {
          if (exists $u{$_}) { undef; } else { $u{$_}=undef;$_; }
        } split ' ', $alltags;
    undef %u;

    my $when = $latest;

    $tmpl->process(\$templates->{LINK}, {
        COUNT => $count,
        LINK => $item,
        PEOPLE => $linkout,
        WHEN => $when,
        WHENRELATIVE => my_ago($when),
        TAGS => $uniqtags,
      }, \$itemout) or die $tmpl->error();

    $rss->add_item(
      title => ($toptitle || '(Untitled)'),
      link => $item,
      description => $itemout,
      dc => {
        date => $when,
        subject => $uniqtags
      },
    );
  }

  open (OUT, ">$outputfile") or die "failed to write to $outputfile";
  eval { binmode OUT, ":encoding(utf8)"; };
  print OUT $rss->as_string;
  close OUT or die "failed to write to $outputfile";

  print "writing: $outputfile\n";
}

# ---------------------------------------------------------------------------

sub mech_cache_new {
  my ($mech) = @_;
  my $cache = $ENV{HOME}."/.leechrss/cache";
  (-d $cache) or mkpath ($cache);
  my $self = {
    mech => $mech,
    cache => $cache
  };
  $self;
}

sub get_with_cache {
  my ($self, $link) = @_;

  my ($status, $uri, $content) = cache_get ($self, $link);
  if ($uri) {
    warn "cached: $link\n";
    $self->{last_status} = $status;
    return ($uri, $content, 0);
  }
  else {
    warn "get: $link\n";

    eval {
      my $resp = $self->{mech}->get ($link);
      $uri = $self->{mech}->uri;
      $content = Encode::decode_utf8 $self->{mech}->content;
      $self->{last_status} = $self->{mech}->status;
      my $expires = int $resp->fresh_until();
      cache_put ($self, $link, $uri, $content, $expires, $self->{last_status});
      1;
    } or warn $@;

    if ($self->{last_status} =~ /^2/) {
      return ($uri, $content, 1);
    }
  }
  return;
}

sub last_get_status {
  my ($self) = @_;
  return $self->{last_status};
}

sub cache_put {
  my ($self, $link, $uri, $content, $expires, $status) = @_;

  my $cfile = $link;
  $cfile =~ s/[^A-Za-z0-9_]/_/gs;
  $cfile = "$self->{cache}/$cfile";
  open (OUT, ">$cfile") or die "cannot write $cfile";
  eval { binmode OUT, ":encoding(utf8)"; };
  print OUT "Status: $status\n";
  print OUT "Expires: $expires\n";
  if (defined $uri) {
    print OUT "URI: $uri\n";
  }
  print OUT "\n";
  print OUT $content;
  close OUT;
}

sub cache_get {
  my ($self, $link) = @_;

  my $cfile = $link;
  $cfile =~ s/[^A-Za-z0-9_]/_/gs;
  $cfile = "$self->{cache}/$cfile";
  open (IN, "<$cfile") or return ();

  my $uri;
  my $status;
  my $expires;
  while (<IN>) {
    s/\r?\n$//s;
    /^URI: (.*)$/ and $uri = $1;
    /^Status: (.*)$/ and $status = $1;
    /^Expires: (.*)$/ and $expires = $1;
    /^$/ and last;
  }
  my $content = join ('', <IN>);
  close IN;

  if (time > $expires) {
    # the cached copy has expired.  return nothing
    return ();
  }

  return ($status, $uri, $content);
}

# ---------------------------------------------------------------------------

sub usage {
  pod2usage(
    -verbose => 0,
    -exitval => 1
  );
}

1;

