#
# Harvester
#
#
# harvester.pl -- Harvests Feeds
# More information: http://www.downes.ca/edurss02.htm
#
# by Stephen Downes
# Version 0.2 - January 2, 2006
#
#
# Copyright 2006 Stephen Downes, National Research Council Canada
# Prerelease - please do not distribute - I'll GPL it when it's ready
#
#----------------------------------------------------------------------------
#
# Harvest Functions
#
#----------------------------------------------------------------------------
sub harvest {
my ($dbh,$vars,$options,$person,$all) = @_;
unless ($vars->{mode} eq "silent") {
unless ($all) { print &header($dbh,$vars,$options,$person); print "
Harvest Results
"; }
}
$vars->{feed} = &harvest_queue($dbh) unless ($vars->{feed});
my $sql = "SELECT * FROM feed WHERE feed_id = '$vars->{feed}' LIMIT 1";
my $sth = $dbh->prepare($sql);
$sth->execute();
my $feed_info = $sth -> fetchrow_hashref();
&error($dbh,$vars,$person,"Feed has no RSS URL") unless ($feed_info->{feed_link});
unless ($vars->{mode} eq "silent") { print qq||; }
&parse_url($dbh,$vars,$options,$person,$feed_info);
if ($all) { return; } else {
unless ($vars->{mode} eq "silent") { print &footer($dbh,$vars,$options,$person); } exit;
}
exit;
}
sub harvest_all {
my ($dbh,$vars,$options,$person) = @_;
unless ($vars->{mode} eq "silent") {
unless ($all) { print &header($dbh,$vars,$options,$person); print "Harvest Results
"; }
}
$stmt = "SELECT * FROM feed WHERE feed_status = 'A'";
my $sth = $dbh->prepare($stmt);
$sth->execute();
while (my $ref = $sth -> fetchrow_hashref()) {
$vars->{feed} = $ref->{feed_id};
&harvest($dbh,$vars,$options,$person,1);
}
if ($all) { return; } else {
unless ($vars->{mode} eq "silent") { print &footer($dbh,$vars,$options,$person); } exit;
}
exit;
}
sub harvest_queue { # Harvests next item in queue, updates queue
my ($dbh) = @_;
my $stmt = "SELECT queue_id,queue_feed FROM queue ORDER BY queue_crdate LIMIT 0,1";
my $sth = $dbh->prepare($stmt);
$sth->execute();
my $ref = $sth -> fetchrow_hashref();
my $now = time;
die "Queue select error" unless ($ref->{queue_id});
&db_update($dbh,"queue",{queue_crdate=>$now},$ref->{queue_id});
return $ref->{queue_feed};
}
sub parse_url {
my ($dbh,$vars,$options,$person,$feed_info) = @_;
my $url = $feed_info->{feed_link};
scalar $url or die "usage: shownodes xmlfile\n";
my ($content,$error) = &get_rss($url);
unless ($content) {
unless ($vars->{mode} eq "silent") { print "No content from URL
\n"; }
return;
}
if ($error) {
unless ($vars->{mode} eq "silent") { print "$error
"; }
return;
}
# Get Feed Info
my $feed_journal=&feed_journal($dbh,$vars,$options,$person,$feed_info);
my $feed_author=&feed_author($dbh,$vars,$options,$person,$feed_info);
my $feed = new Feed;
$feed->{feed_id} = $vars->{feed};
$content =~ s/^\s*; # Remove leading white space from XML (a very common error)
my $parser = XML::LibXML->new();
my $tree;
eval { $tree = $parser->parse_string($content) };
if ($@) { unless ($vars->{mode} eq "silent") { print "Parser error
"; return; } }
my $root = $tree->getDocumentElement;
my $link = ();
&show($vars,$feed,$link,$root->nodeName,$root->childNodes);
$vars->{$idfield} = &in_save_record($dbh,$person,$options,"feed",$feed); # Update Feed Data
my $msg;
unless ($vars->{mode} eq "silent") { print "\n"; }
foreach my $li (@{$feed->{link}}) {
unless ($vars->{mode} eq "silent") { print qq|
|,$li->{link_title},qq|\n|; }
next unless ($li->{link_link}); # No blank links
unless ($li->{link_title}) { $li->{link_title} = "Untitled"; }
if (&db_locate($dbh,"link",{link_link => $li->{link_link},link_status=>'primary'})) {
unless ($vars->{mode} eq "silent") { print "Already there.\n"; }
} else {
$li->{link_status} = "primary"; # Flag as primary source
$li->{link_crdate} = time; # crdate
$li->{link_creator} = $person->{person_id}; # creator
$li->{link_feedid} = $vars->{feed};
$li->{link_journal} = $feed_journal->{journal_id}; # Journal
# Collect cites
my $docroot = $li->{link_link};
$docroot =~ s|http://||i; my @docarr = split /\//,$docroot; $docroot = shift @docarr;
my $extract = $li->{link_description};
# Adjust Description
if ($Site->{hv_htm} eq "no") {
$li->{link_description} =~ s/(<|<)(.*?)(>|>)//gi; # Strip HTML
$li->{link_title} =~ s/(<|<)(.*?)(>|>)//gi;
if ($Site->{hv_trim}) {
$li->{link_description} = substr $li->{link_description}, 0, $Site->{hv_trim};
}
}
# Check for prior (secondary) instance, overwrite if it exists
$li->{link_id} = &db_locate($dbh,"link",{link_link => $li->{link_link}});
# Save Input Record
my $linkid = &in_save_record($dbh,$person,$options,"link",$li);
if ($linkid) {
# Link Author
my $link_author = &link_author($dbh,$vars,$options,$person,$li);
unless ($link_author) { $link_author = $feed_author; }
if ($link_author->{author_id}) {
my $la = &db_insert($dbh,"link_author",{link_author_link=>$linkid,
link_author_author=>$link_author->{author_id},
link_author_journal=>$feed_journal->{journal_id}});
}
unless ($vars->{mode} eq "silent") { print "Harvested OK\n"; }
# Link Cites
$extract =~ s/\/A/\/a/ig; # Just in case
$extract =~ s/<//ig;
while ($extract =~ /
(.*?)<\/a>/gi) {
my $ldata = $1; my $ltit = $2; my $lref; my $lid;
$ltit =~ s/(<|<)(.*?)(>|>)//gi;
if ($ldata =~ /href="(.*?)"/) { $lref=$1; } else { $lref=""; }
next unless $lref;
next if ($lref =~ /$docroot/); # Skip local links
$lref =~ s/&/&/g;
$lid = &db_locate($dbh,"link",{link_link=>$lref});
unless ($lid) { # Save a placeholder link that will be over-written if ever harvested
my $cr = time;
$lid = &db_insert($dbh,"link",{link_link=>$lref,link_title=>$ltit,
link_status=>'secondary',link_crdate=>$cr});
}
unless ($lid) { die "Ack! Pfft! Why couldn't I create a link? $lref"; }
&db_insert($dbh,"cite",{cite_cited=>$lid,cite_citer=>$linkid,cite_citerfeed=>$feed->{feed_id}});
my $replace = "".$ltit."";
$extract =~ s/\Q$replace\E//g;
}
} else { unless ($vars->{mode} eq "silent") { print "Save error!"; } }
}
unless ($vars->{mode} eq "silent") { print "
"; }
}
unless ($vars->{mode} eq "silent") { print "
"; }
#exit;
}
sub feed_journal { # Finds the publication associated with a feed and returns it
my ($dbh,$vars,$options,$person,$feed) = @_;
my $pubid; my $stmt; my $ud;
if ($feed->{feed_journal}) { # Does it have one associated?
$stmt = "SELECT * FROM journal WHERE journal_id = '$feed->{feed_journal}' LIMIT 1";
}
else { # Nope...
if ($feed->{feed_html}) {
$ud = 1;
$stmt = "SELECT * FROM journal WHERE journal_link = '$feed->{feed_html}' LIMIT 1";
}
}
if ($stmt) {
my $sth = $dbh->prepare($stmt);
$sth->execute();
$feed_journal = $sth -> fetchrow_hashref();
}
if ($feed_journal) {
if ($ud eq "1") { # Update feed info if a journal had to be found
&db_update($dbh,"feed",{feed_journal=>$feed_journal->{journal_id}},$feed->{feed_id});
}
return $feed_journal;
} else { return 0; }
}
sub feed_author {
my ($dbh,$vars,$options,$person,$feed) = @_;
my $pubid; my $stmt; my $ud; my $feed_author;
if ($feed->{feed_author}) { # Does it have one associated?
$stmt = "SELECT * FROM author WHERE author_id = '$feed->{feed_author}' LIMIT 1";
my $sth = $dbh->prepare($stmt);
$sth->execute();
$feed_author = $sth -> fetchrow_hashref();
if ($feed_author) { return $feed_author; }
}
else {
$feed_author =
&try_match($dbh,"feed",$feed,"feed_authorurl","author","author_link") ||
&try_match($dbh,"feed",$feed,"feed_html","author","author_link") ||
&try_match($dbh,"feed",$feed,"feed_authorname","author","author_name") ||
&try_match($dbh,"feed",$feed,"feed_authoremail","author","author_email");
if ($feed_author) {
&db_update($dbh,"feed",{feed_author=>$feed_author->{author_id}},$feed->{feed_id});
return $feed_author;
}
}
return 0;
}
sub link_author {
my ($dbh,$vars,$options,$person,$link) = @_;
$link_author =
&try_match($dbh,"link",$link,"link_creatorurl","author","author_link") ||
&try_match($dbh,"link",$link,"link_creatorname","author","author_link") ||
&try_match($dbh,"link",$link,"link_creatoremail","author","author_email");
if ($link_author) {
return $link_author;
}
return 0;
}
sub try_match {
# Eg, to match feed->{feed_authorurl} to author->{author_link) :
# &try_match($dbh,"feed",$feedhash,"feed_authorurl","author","author_link");
my ($dbh,$table,$tabledata,$tableitem,$mtable,$mitem,) = @_;
my $matchitem;
if ($tabledata->{$tableitem}) {
$stmt = "SELECT * FROM $mtable WHERE $mitem = '$tabledata->{$tableitem}' LIMIT 1";
my $sth = $dbh->prepare($stmt);
$sth->execute();
$matchitem = $sth -> fetchrow_hashref();
if ($matchitem) {
return $matchitem;
}
}
}
sub show {
my ($vars,$feed,$link,$parent,@nodes) = @_;
foreach my $node (@nodes) {
next unless($node);
#next if ($node->nodeName eq "cdata");
#next if ($node->nodeName eq "text");
my $nodetext = $node->textContent;
my $nodename = lc($node->nodeName);
if ($nodetext) { $nodetext =~ s/(\n|\r)/ /g; $nodetext =~ s/\s+/ /g; if ($nodetext eq " ") { $nodetext = ""; } }
my $atth=();
if ($node->hasAttributes()) {
my @attributelist = $node->attributes();
foreach $att (@attributelist) { $atth->{$att->name} = $att->value; }
}
if ($node->nodeName eq "item" || $node->nodeName eq "entry") { # Speed through item declarations
$link = new Link;
push @{$feed->{link}},$link;
show($vars,$feed,$link,$node->nodeName,$node->childNodes);
}
# RSS Items
if ($node->parentNode->nodeName =~ /item/i) {
if ($nodename eq "title") { $link->{link_title} = $nodetext; } # Title
if ($nodename eq "link") { $link->{link_link} = $nodetext; } # Link
if ($node->nodeName =~ /description|content/i) { $link->{link_description} = $nodetext; } # Description
if ($nodename eq "summary") { $link->{link_summary} = $nodetext; }
if ($nodename eq "guid") { $link->{link_guid} = $nodetext; } # Guid
if ($nodename eq "category") { $link->{link_category} = $nodetext; } # Category
if ($nodename eq "pubDate") { $link->{link_pubdate} = $nodetext; } # Dates
if ($nodename eq "dc:creator") { $link->{link_creatorname} = $nodetext;} # Feed author
if ($nodename eq "dc:publisher") { $link->{link_publisher} = $nodetext;} # Feed publisher
}
# Atom Items
elsif ($node->parentNode->nodeName =~ /entry/i) {
if ($nodename eq "title") { $link->{link_title} = $nodetext; } # Title
if ($nodename eq "link") { # Link
if ($atth->{rel} =~ /post/i) { $link->{link_link} = $atth->{href}; }
elsif ($atth->{rel} =~ /alternate/i) {
unless ($link->{link_link}) { $link->{link_link} = $atth->{href}; }
}
elsif ($atth->{rel} =~ /edit/i) { $link->{link_edit} = $atth->{href}; }
}
if ($nodename eq "content") { $link->{link_description} = $nodetext; } # Description
if ($nodename eq "summary") { $link->{link_summary} = $nodetext; }
if ($nodename eq "guid") { $link->{link_guid} = $nodetext; } # Guid
if ($nodename eq "id") { $link->{link_identifier} = $nodetext;} # Identifier
if ($nodename eq "category") { $link->{link_category} = $nodetext; } # Category
if ($nodename eq "pubDate") { $link->{link_pubdate} = $nodetext; } # Dates
if ($nodename eq "created") { $link->{link_created} = $nodetext; }
if ($nodename eq "modified") { $link->{link_modified} = $nodetext; }
if ($nodename eq "issued") { $link->{link_issued} = $nodetext; }
if ($nodename eq "author") { # *sigh*
if ($node->hasChildNodes()) {
my ($na) = $node->getChildrenByTagName("name");
if ($na) { $link->{link_creatorname} = $na->textContent; }
($nb) = $node->getChildrenByTagName("url");
if ($nb) { $link->{link_creatorurl} = $nb->textContent; }
($nc) = $node->getChildrenByTagName("email");
if ($nc) { $link->{link_creatoremail} = $nc->textContent; }
}
}
}
# RSS Channels
elsif ($node->parentNode->nodeName =~ /channel/i) {
if ($nodename eq "title") { $feed->{feed_title} = $nodetext;} # Feed Title
if ($nodename eq "description") { $feed->{feed_description} = $nodetext;} # Feed Description
if ($nodename eq "link") { $feed->{feed_html} = $nodetext;} # Feed HTML Link
if ($nodename eq "lastBuildDate") { $feed->{feed_lastBuildDate} = $nodetext;} # Feed Last Build Date
if ($nodename eq "pubDate") { $feed->{feed_pubDate} = $nodetext; } # Feed Publication Date
if ($nodename eq "generator") { $feed->{feed_genname} = $nodetext;} # Feed Generator
if ($nodename eq "docs") { $feed->{feed_docs} = $nodetext;} # Feed Documents
if ($nodename eq "version") { $feed->{feed_version} = $nodetext;} # Feed Version
if ($nodename eq "managingEditor") { $feed->{feed_managingEditor} = $nodetext;} # Feed Managing Editor
if ($nodename eq "webMaster") { $feed->{feed_webMaster} = $nodetext;} # Feed webmaster
if ($nodename eq "dc:creator") { $feed->{feed_creatorname} = $nodetext;} # Feed author
if ($nodename eq "dc:publisher") { $feed->{feed_publisher} = $nodetext;} # Feed publisher
}
# Atom Channels
elsif ($node->parentNode->nodeName =~ /feed/i) {
if ($nodename eq "title") { print "$feed->{feed_title} ";$feed->{feed_title} = $nodetext;} # Feed Title
if ($nodename eq "link") { # Link
if ($atth->{rel} =~ /post/i) { $feed->{feed_html} = $atth->{href}; }
elsif ($atth->{rel} =~ /alternate/i) {
unless ($feed->{feed_html}) { $feed->{feed_html} = $atth->{href}; }
}
elsif ($atth->{rel} =~ /edit/i) { $feed->{feed_edit} = $atth->{href}; }
}
if ($nodename eq "tagline") { $feed->{feed_description} = $nodetext;} # Feed Description
if ($nodename eq "modified") { $feed->{feed_lastBuildDate} = $nodetext;} # Feed Last Build Date
if ($nodename eq "author") { # *sigh*
if ($node->hasChildNodes()) {
my ($na) = $node->getChildrenByTagName("name");
if ($na) { $feed->{feed_creatorname} = $na->textContent; }
my ($nb) = $node->getChildrenByTagName("url");
if ($nb) { $feed->{feed_creatorurl} = $nb->textContent; }
my ($nc) = $node->getChildrenByTagName("email");
if ($nc) { $feed->{feed_creatoremail} = $nc->textContent; }
}
}
}
if ($node->childNodes) { &show($vars,$feed,$link,$node->nodeName,$node->childNodes); }
}
}
sub get_rss {
my $url = shift; my $error;
unless ($url) { return ("No content","URL doesn't exist"); }
my $ua = new LWP::UserAgent;
$ua->agent("Edu_RSS/0.2 " . $ua->agent);
$ua -> timeout(15);
my $req = HTTP::Request->new(GET => $url);
# $req->content_type('application/x-www-form-urlencoded');
# $req->header('Content-Length' => $ENV{'CONTENT_LENGTH'});
# $req->content('match=www&errors=0');
my $res = $ua->request($req);
if (!$res->is_success) {
my $res_error = $res->status_line;
return ("No content", "Could not access file: $res_error");
}
my $webpage = $res->content;
if ($webpage eq "") { return ("No content", "File was blank."); }
my $debug_flag=0;
#$webpage = &clean_xml($webpage);
return ($webpage,$error);
}
sub create_queue {
my ($dbh,$options,$vars,$person,$opml) = @_;
die "Permission Denied" unless $person->{person_status} eq "admin";
my $stmt = "DELETE FROM queue"; # Wipe out old queue
$dbh->do($stmt,undef);
$stmt = "SELECT * FROM feed WHERE feed_status = 'A'";
my $sth = $dbh->prepare($stmt);
$sth->execute();
while (my $ref = $sth -> fetchrow_hashref()) {
#next unless ($fv->{feed_status} eq "A");
my $qtime = time;
&db_insert($dbh,'queue',{queue_feed=>$ref->{feed_id},queue_crdate=>$qtime});
}
$options->{msg} .= "A new queue has been created.";
unless ($opml) { &admin_main($dbh,$vars,$options,$person); }
return;
}
sub opml_options { # Screen to import OPML
my ($dbh,$options,$vars,$person) = @_;
print &header($dbh,$vars,$options,$person);
print qq|
Import OPML
This screen allows you to upload an OPML file into your feed list. Existing feeds will not be replaced.
Select your OPML file: