# # Harvester # # # harvester.pl -- Harvests Feeds # More information: http://www.downes.ca/edurss02.htm # # by Stephen Downes # Version 0.2 - January 2, 2006 # # # Copyright 2006 Stephen Downes, National Research Council Canada # Prerelease - please do not distribute - I'll GPL it when it's ready # #---------------------------------------------------------------------------- # # Harvest Functions # #---------------------------------------------------------------------------- sub harvest { my ($dbh,$vars,$options,$person,$all) = @_; unless ($vars->{mode} eq "silent") { unless ($all) { print &header($dbh,$vars,$options,$person); print "

Harvest Results

"; } } $vars->{feed} = &harvest_queue($dbh) unless ($vars->{feed}); my $sql = "SELECT * FROM feed WHERE feed_id = '$vars->{feed}' LIMIT 1"; my $sth = $dbh->prepare($sql); $sth->execute(); my $feed_info = $sth -> fetchrow_hashref(); &error($dbh,$vars,$person,"Feed has no RSS URL") unless ($feed_info->{feed_link}); unless ($vars->{mode} eq "silent") { print qq|

Harvesting: $feed_info->{feed_link}
|; } &parse_url($dbh,$vars,$options,$person,$feed_info); if ($all) { return; } else { unless ($vars->{mode} eq "silent") { print &footer($dbh,$vars,$options,$person); } exit; } exit; } sub harvest_all { my ($dbh,$vars,$options,$person) = @_; unless ($vars->{mode} eq "silent") { unless ($all) { print &header($dbh,$vars,$options,$person); print "

Harvest Results

"; } } $stmt = "SELECT * FROM feed WHERE feed_status = 'A'"; my $sth = $dbh->prepare($stmt); $sth->execute(); while (my $ref = $sth -> fetchrow_hashref()) { $vars->{feed} = $ref->{feed_id}; &harvest($dbh,$vars,$options,$person,1); } if ($all) { return; } else { unless ($vars->{mode} eq "silent") { print &footer($dbh,$vars,$options,$person); } exit; } exit; } sub harvest_queue { # Harvests next item in queue, updates queue my ($dbh) = @_; my $stmt = "SELECT queue_id,queue_feed FROM queue ORDER BY queue_crdate LIMIT 0,1"; my $sth = $dbh->prepare($stmt); $sth->execute(); my $ref = $sth -> fetchrow_hashref(); my $now = time; die "Queue select error" unless ($ref->{queue_id}); &db_update($dbh,"queue",{queue_crdate=>$now},$ref->{queue_id}); return $ref->{queue_feed}; } sub parse_url { my ($dbh,$vars,$options,$person,$feed_info) = @_; my $url = $feed_info->{feed_link}; scalar $url or die "usage: shownodes xmlfile\n"; my ($content,$error) = &get_rss($url); unless ($content) { unless ($vars->{mode} eq "silent") { print "No content from URL
\n"; } return; } if ($error) { unless ($vars->{mode} eq "silent") { print "$error
"; } return; } # Get Feed Info my $feed_journal=&feed_journal($dbh,$vars,$options,$person,$feed_info); my $feed_author=&feed_author($dbh,$vars,$options,$person,$feed_info); my $feed = new Feed; $feed->{feed_id} = $vars->{feed}; $content =~ s/^\s*new(); my $tree; eval { $tree = $parser->parse_string($content) }; if ($@) { unless ($vars->{mode} eq "silent") { print "Parser error
"; return; } } my $root = $tree->getDocumentElement; my $link = (); &show($vars,$feed,$link,$root->nodeName,$root->childNodes); $vars->{$idfield} = &in_save_record($dbh,$person,$options,"feed",$feed); # Update Feed Data my $msg; unless ($vars->{mode} eq "silent") { print "
\n"; } foreach my $li (@{$feed->{link}}) { unless ($vars->{mode} eq "silent") { print qq||,$li->{link_title},qq|\n|; } next unless ($li->{link_link}); # No blank links unless ($li->{link_title}) { $li->{link_title} = "Untitled"; } if (&db_locate($dbh,"link",{link_link => $li->{link_link},link_status=>'primary'})) { unless ($vars->{mode} eq "silent") { print "Already there.\n"; } } else { $li->{link_status} = "primary"; # Flag as primary source $li->{link_crdate} = time; # crdate $li->{link_creator} = $person->{person_id}; # creator $li->{link_feedid} = $vars->{feed}; $li->{link_journal} = $feed_journal->{journal_id}; # Journal # Collect cites my $docroot = $li->{link_link}; $docroot =~ s|http://||i; my @docarr = split /\//,$docroot; $docroot = shift @docarr; my $extract = $li->{link_description}; # Adjust Description if ($Site->{hv_htm} eq "no") { $li->{link_description} =~ s/(<|<)(.*?)(>|>)//gi; # Strip HTML $li->{link_title} =~ s/(<|<)(.*?)(>|>)//gi; if ($Site->{hv_trim}) { $li->{link_description} = substr $li->{link_description}, 0, $Site->{hv_trim}; } } # Check for prior (secondary) instance, overwrite if it exists $li->{link_id} = &db_locate($dbh,"link",{link_link => $li->{link_link}}); # Save Input Record my $linkid = &in_save_record($dbh,$person,$options,"link",$li); if ($linkid) { # Link Author my $link_author = &link_author($dbh,$vars,$options,$person,$li); unless ($link_author) { $link_author = $feed_author; } if ($link_author->{author_id}) { my $la = &db_insert($dbh,"link_author",{link_author_link=>$linkid, link_author_author=>$link_author->{author_id}, link_author_journal=>$feed_journal->{journal_id}}); } unless ($vars->{mode} eq "silent") { print "Harvested OK\n"; } # Link Cites $extract =~ s/\/A/\/a/ig; # Just in case $extract =~ s/<//ig; while ($extract =~ /(.*?)<\/a>/gi) { my $ldata = $1; my $ltit = $2; my $lref; my $lid; $ltit =~ s/(<|<)(.*?)(>|>)//gi; if ($ldata =~ /href="(.*?)"/) { $lref=$1; } else { $lref=""; } next unless $lref; next if ($lref =~ /$docroot/); # Skip local links $lref =~ s/&/&/g; $lid = &db_locate($dbh,"link",{link_link=>$lref}); unless ($lid) { # Save a placeholder link that will be over-written if ever harvested my $cr = time; $lid = &db_insert($dbh,"link",{link_link=>$lref,link_title=>$ltit, link_status=>'secondary',link_crdate=>$cr}); } unless ($lid) { die "Ack! Pfft! Why couldn't I create a link? $lref"; } &db_insert($dbh,"cite",{cite_cited=>$lid,cite_citer=>$linkid,cite_citerfeed=>$feed->{feed_id}}); my $replace = "".$ltit.""; $extract =~ s/\Q$replace\E//g; } } else { unless ($vars->{mode} eq "silent") { print "Save error!"; } } } unless ($vars->{mode} eq "silent") { print "
"; } } unless ($vars->{mode} eq "silent") { print "
"; } #exit; } sub feed_journal { # Finds the publication associated with a feed and returns it my ($dbh,$vars,$options,$person,$feed) = @_; my $pubid; my $stmt; my $ud; if ($feed->{feed_journal}) { # Does it have one associated? $stmt = "SELECT * FROM journal WHERE journal_id = '$feed->{feed_journal}' LIMIT 1"; } else { # Nope... if ($feed->{feed_html}) { $ud = 1; $stmt = "SELECT * FROM journal WHERE journal_link = '$feed->{feed_html}' LIMIT 1"; } } if ($stmt) { my $sth = $dbh->prepare($stmt); $sth->execute(); $feed_journal = $sth -> fetchrow_hashref(); } if ($feed_journal) { if ($ud eq "1") { # Update feed info if a journal had to be found &db_update($dbh,"feed",{feed_journal=>$feed_journal->{journal_id}},$feed->{feed_id}); } return $feed_journal; } else { return 0; } } sub feed_author { my ($dbh,$vars,$options,$person,$feed) = @_; my $pubid; my $stmt; my $ud; my $feed_author; if ($feed->{feed_author}) { # Does it have one associated? $stmt = "SELECT * FROM author WHERE author_id = '$feed->{feed_author}' LIMIT 1"; my $sth = $dbh->prepare($stmt); $sth->execute(); $feed_author = $sth -> fetchrow_hashref(); if ($feed_author) { return $feed_author; } } else { $feed_author = &try_match($dbh,"feed",$feed,"feed_authorurl","author","author_link") || &try_match($dbh,"feed",$feed,"feed_html","author","author_link") || &try_match($dbh,"feed",$feed,"feed_authorname","author","author_name") || &try_match($dbh,"feed",$feed,"feed_authoremail","author","author_email"); if ($feed_author) { &db_update($dbh,"feed",{feed_author=>$feed_author->{author_id}},$feed->{feed_id}); return $feed_author; } } return 0; } sub link_author { my ($dbh,$vars,$options,$person,$link) = @_; $link_author = &try_match($dbh,"link",$link,"link_creatorurl","author","author_link") || &try_match($dbh,"link",$link,"link_creatorname","author","author_link") || &try_match($dbh,"link",$link,"link_creatoremail","author","author_email"); if ($link_author) { return $link_author; } return 0; } sub try_match { # Eg, to match feed->{feed_authorurl} to author->{author_link) : # &try_match($dbh,"feed",$feedhash,"feed_authorurl","author","author_link"); my ($dbh,$table,$tabledata,$tableitem,$mtable,$mitem,) = @_; my $matchitem; if ($tabledata->{$tableitem}) { $stmt = "SELECT * FROM $mtable WHERE $mitem = '$tabledata->{$tableitem}' LIMIT 1"; my $sth = $dbh->prepare($stmt); $sth->execute(); $matchitem = $sth -> fetchrow_hashref(); if ($matchitem) { return $matchitem; } } } sub show { my ($vars,$feed,$link,$parent,@nodes) = @_; foreach my $node (@nodes) { next unless($node); #next if ($node->nodeName eq "cdata"); #next if ($node->nodeName eq "text"); my $nodetext = $node->textContent; my $nodename = lc($node->nodeName); if ($nodetext) { $nodetext =~ s/(\n|\r)/ /g; $nodetext =~ s/\s+/ /g; if ($nodetext eq " ") { $nodetext = ""; } } my $atth=(); if ($node->hasAttributes()) { my @attributelist = $node->attributes(); foreach $att (@attributelist) { $atth->{$att->name} = $att->value; } } if ($node->nodeName eq "item" || $node->nodeName eq "entry") { # Speed through item declarations $link = new Link; push @{$feed->{link}},$link; show($vars,$feed,$link,$node->nodeName,$node->childNodes); } # RSS Items if ($node->parentNode->nodeName =~ /item/i) { if ($nodename eq "title") { $link->{link_title} = $nodetext; } # Title if ($nodename eq "link") { $link->{link_link} = $nodetext; } # Link if ($node->nodeName =~ /description|content/i) { $link->{link_description} = $nodetext; } # Description if ($nodename eq "summary") { $link->{link_summary} = $nodetext; } if ($nodename eq "guid") { $link->{link_guid} = $nodetext; } # Guid if ($nodename eq "category") { $link->{link_category} = $nodetext; } # Category if ($nodename eq "pubDate") { $link->{link_pubdate} = $nodetext; } # Dates if ($nodename eq "dc:creator") { $link->{link_creatorname} = $nodetext;} # Feed author if ($nodename eq "dc:publisher") { $link->{link_publisher} = $nodetext;} # Feed publisher } # Atom Items elsif ($node->parentNode->nodeName =~ /entry/i) { if ($nodename eq "title") { $link->{link_title} = $nodetext; } # Title if ($nodename eq "link") { # Link if ($atth->{rel} =~ /post/i) { $link->{link_link} = $atth->{href}; } elsif ($atth->{rel} =~ /alternate/i) { unless ($link->{link_link}) { $link->{link_link} = $atth->{href}; } } elsif ($atth->{rel} =~ /edit/i) { $link->{link_edit} = $atth->{href}; } } if ($nodename eq "content") { $link->{link_description} = $nodetext; } # Description if ($nodename eq "summary") { $link->{link_summary} = $nodetext; } if ($nodename eq "guid") { $link->{link_guid} = $nodetext; } # Guid if ($nodename eq "id") { $link->{link_identifier} = $nodetext;} # Identifier if ($nodename eq "category") { $link->{link_category} = $nodetext; } # Category if ($nodename eq "pubDate") { $link->{link_pubdate} = $nodetext; } # Dates if ($nodename eq "created") { $link->{link_created} = $nodetext; } if ($nodename eq "modified") { $link->{link_modified} = $nodetext; } if ($nodename eq "issued") { $link->{link_issued} = $nodetext; } if ($nodename eq "author") { # *sigh* if ($node->hasChildNodes()) { my ($na) = $node->getChildrenByTagName("name"); if ($na) { $link->{link_creatorname} = $na->textContent; } ($nb) = $node->getChildrenByTagName("url"); if ($nb) { $link->{link_creatorurl} = $nb->textContent; } ($nc) = $node->getChildrenByTagName("email"); if ($nc) { $link->{link_creatoremail} = $nc->textContent; } } } } # RSS Channels elsif ($node->parentNode->nodeName =~ /channel/i) { if ($nodename eq "title") { $feed->{feed_title} = $nodetext;} # Feed Title if ($nodename eq "description") { $feed->{feed_description} = $nodetext;} # Feed Description if ($nodename eq "link") { $feed->{feed_html} = $nodetext;} # Feed HTML Link if ($nodename eq "lastBuildDate") { $feed->{feed_lastBuildDate} = $nodetext;} # Feed Last Build Date if ($nodename eq "pubDate") { $feed->{feed_pubDate} = $nodetext; } # Feed Publication Date if ($nodename eq "generator") { $feed->{feed_genname} = $nodetext;} # Feed Generator if ($nodename eq "docs") { $feed->{feed_docs} = $nodetext;} # Feed Documents if ($nodename eq "version") { $feed->{feed_version} = $nodetext;} # Feed Version if ($nodename eq "managingEditor") { $feed->{feed_managingEditor} = $nodetext;} # Feed Managing Editor if ($nodename eq "webMaster") { $feed->{feed_webMaster} = $nodetext;} # Feed webmaster if ($nodename eq "dc:creator") { $feed->{feed_creatorname} = $nodetext;} # Feed author if ($nodename eq "dc:publisher") { $feed->{feed_publisher} = $nodetext;} # Feed publisher } # Atom Channels elsif ($node->parentNode->nodeName =~ /feed/i) { if ($nodename eq "title") { print "$feed->{feed_title}

";$feed->{feed_title} = $nodetext;} # Feed Title if ($nodename eq "link") { # Link if ($atth->{rel} =~ /post/i) { $feed->{feed_html} = $atth->{href}; } elsif ($atth->{rel} =~ /alternate/i) { unless ($feed->{feed_html}) { $feed->{feed_html} = $atth->{href}; } } elsif ($atth->{rel} =~ /edit/i) { $feed->{feed_edit} = $atth->{href}; } } if ($nodename eq "tagline") { $feed->{feed_description} = $nodetext;} # Feed Description if ($nodename eq "modified") { $feed->{feed_lastBuildDate} = $nodetext;} # Feed Last Build Date if ($nodename eq "author") { # *sigh* if ($node->hasChildNodes()) { my ($na) = $node->getChildrenByTagName("name"); if ($na) { $feed->{feed_creatorname} = $na->textContent; } my ($nb) = $node->getChildrenByTagName("url"); if ($nb) { $feed->{feed_creatorurl} = $nb->textContent; } my ($nc) = $node->getChildrenByTagName("email"); if ($nc) { $feed->{feed_creatoremail} = $nc->textContent; } } } } if ($node->childNodes) { &show($vars,$feed,$link,$node->nodeName,$node->childNodes); } } } sub get_rss { my $url = shift; my $error; unless ($url) { return ("No content","URL doesn't exist"); } my $ua = new LWP::UserAgent; $ua->agent("Edu_RSS/0.2 " . $ua->agent); $ua -> timeout(15); my $req = HTTP::Request->new(GET => $url); # $req->content_type('application/x-www-form-urlencoded'); # $req->header('Content-Length' => $ENV{'CONTENT_LENGTH'}); # $req->content('match=www&errors=0'); my $res = $ua->request($req); if (!$res->is_success) { my $res_error = $res->status_line; return ("No content", "Could not access file: $res_error"); } my $webpage = $res->content; if ($webpage eq "") { return ("No content", "File was blank."); } my $debug_flag=0; #$webpage = &clean_xml($webpage); return ($webpage,$error); } sub create_queue { my ($dbh,$options,$vars,$person,$opml) = @_; die "Permission Denied" unless $person->{person_status} eq "admin"; my $stmt = "DELETE FROM queue"; # Wipe out old queue $dbh->do($stmt,undef); $stmt = "SELECT * FROM feed WHERE feed_status = 'A'"; my $sth = $dbh->prepare($stmt); $sth->execute(); while (my $ref = $sth -> fetchrow_hashref()) { #next unless ($fv->{feed_status} eq "A"); my $qtime = time; &db_insert($dbh,'queue',{queue_feed=>$ref->{feed_id},queue_crdate=>$qtime}); } $options->{msg} .= "A new queue has been created."; unless ($opml) { &admin_main($dbh,$vars,$options,$person); } return; } sub opml_options { # Screen to import OPML my ($dbh,$options,$vars,$person) = @_; print &header($dbh,$vars,$options,$person); print qq|

Import OPML

This screen allows you to upload an OPML file into your feed list. Existing feeds will not be replaced.

Select your OPML file:
From file:

OR From URL:

Default feed status: Approved Pending Approval Retired



|; print &footer($dbh,$vars,$options,$person); exit; } sub upload_opml { my ($dbh,$options,$vars,$person,$query) = @_; print &header($dbh,$vars,$options,$person); my $msg = "

OPML Upload Results

"; my $content; my $error; if ($vars->{url}) { ($content,$error) = &get_rss($vars->{url}); &error($dbh,$vars,$person,"Could not get $vars->{url}") unless ($content); if($error) { &error($dbh,$vars,$person,$error); } } elsif ($vars->{file}) { my $ufilename = $query->upload('file'); while ( <$ufilename> ) { $content .= $_; } &error($dbh,$vars,$person,"Could not upload $ufilename") unless ($content); } else { &error($dbh,$vars,$person,"No input OPML source specified"); } $content =~ s/\n\r/ /mig; $content =~ s/OUTLINE/outline/mig; while ($content =~ //) { my $feed = new Feed; my $autotext = $1; my $line = $1; my $title; my $html; my $xml; my $type; if ($line =~ /title="(.*?)"/i) { $feed->{feed_title} = $1; } if ($line =~ /htmlUrl="(.*?)"/i) { $feed->{feed_html} = $1; } if ($line =~ /xmlUrl="(.*?)"/i) { $feed->{feed_link} = $1; } if ($line =~ /type="(.*?)"/i) { $feed->{feed_type} = $1; } $feed->{feed_status} = $vars->{status}; next unless ($feed->{feed_title} && $feed->{feed_link}); $msg .= "$feed->{feed_title} ($feed->{feed_type}) [{feed_html}\">HTML] [{feed_link}\">XML]"; if (&db_locate($dbh,"feed",{feed_link => $feed->{feed_link}})) { $msg .= " Feed already exists
"; } else { my $fid = &in_save_record($dbh,$person,$options,"feed",$feed); if ($fid) { $msg .= " Uploaded OK, ID=$fid"; } else { $msg .= " Upload error. Bleah."; } } $msg .= "
"; my $repl = qq||; $content =~ s/\Q$repl\E//; } print "
$msg
"; &create_queue($dbh,$options,$vars,$person); print &footer($dbh,$vars,$options,$person); exit; } sub approve_feed { my ($dbh,$options,$vars,$person) = @_; die "Permission Denied" unless $person->{person_status} eq "admin"; &error($dbh,$vars,$person,"Feed not specified") unless ($vars->{feed}); &db_update($dbh,"feed",{feed_status=>"A"},$vars->{feed}); $options->{msg} .= "Feed number $vars->{feed} approved."; &create_queue($dbh,$options,$vars,$person); &list_feeds($vars); return; } sub retire_feed { my ($dbh,$options,$vars,$person) = @_; die "Permission Denied" unless $person->{person_status} eq "admin"; &error($dbh,$vars,$person,"Feed not specified") unless ($vars->{feed}); &db_update($dbh,"feed",{feed_status=>"R"},$vars->{feed}); $options->{msg} .= "Feed number $vars->{feed} retired."; &create_queue($dbh,$options,$vars,$person); &list_feeds($vars); return; } sub list_feeds { my ($vars) = @_; $vars->{feed} = ""; $vars->{db} = "feed"; $vars->{number} = "200"; $vars->{all} = "yes"; $vars->{format} = "list"; $vars->{heading} = "List Feeds"; return; } package Link; use strict; sub new { my $class = shift; my $link = { }; my @commentary = (); my @referral = (); my @learning = (); my @media = (); my @taxon = (); $link->{commentary} = \@commentary; $link->{referral} = \@referral; $link->{media} = \@media; $link->{learning} = \@learning; $link->{taxon} = \@taxon; bless $link, $class; return $link; } 1;