#!/usr/bin/perl # # make_opml.cgi # Version 1.000 # # Stephen Downes -- http://www.downes.ca -- stephen@downes.ca # National Research Council Canada # # January 8, 2004 # # ------------------------------------------------------------------------------------------------ # # license # # ------------------------------------------------------------------------------------------------ # # Released under GNU Public License # # Copyright (C) # # This program is free software; you can redistribute it and/or modify it under the terms of the # GNU General Public License as published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # http://www.opensource.org/licenses/gpl-license.html # You should have received a copy of the GNU General Public License along with this program; if # not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # This General Public License does not permit incorporating your program into proprietary programs. # # $|++; # Turn off print buffering # INCLUDES use DBI; use XML::Parser::PerlSAX; use XML::Writer; use HTTP::Request; use LWP::UserAgent; use CGI qw(:cgi-lib); use CGI::Carp qw(fatalsToBrowser); # Declare variables my $arg; my $content; my $file; my $message; my $routine; my $current_file; my $debug_flag; my $current_uri; my $response; my %channel; print "Content-type: text/html\n\n"; print < Stephen's Web ~ OPML Generator ~ by Stephen Downes

OPML Generator
HTMLTOP my $cgi_q = new CGI; # Populate and clean input hash my $vars = $cgi_q->Vars; # Decisions, decisions if ($vars->{action} eq "Submit Data") { &get_number($vars); } elsif ($vars->{action} eq "Enter Number") { &get_feeds($vars); } elsif ($vars->{action} eq "Create OPML File") { &create_file($vars); } else { &create_form; } print <

Copyright © 2004 Stephen Downes
This work is licensed under a Creative Commons License.
ENDOFPAGE exit; sub create_form { print "
". "Hello. Welcome to Stephen's OPML creator.

". "An OPML file is a way for you to list the blogs or websites that you read ". "regularly. Normally, if you use an RSS Reader, we say that you are 'subscribed' ". "to these 'feeds'. But you don't need to be 'subscribed' to create an OPML file. ". "You just need to have a list of sites that you want to say that you read.

". "Need more info? Have a look at the ". "January 8 edition of OLDaily for the full scoop.

". "Before beginning, you should go to Feeds.Scripting.Com ". "and create an account. You will want to go back there after you have created your OPML file ". "in order to register it and let people know what sites you read.

". "OK, done that? Good. Now please provide the following information for the " . "head of your OPML file.\n" . "

\n". "\n". "\n". "\n". "\n". "\n
Title of your OPML File:
Your email address:
Your name or alias:
\n
". "Note that, while I will not be collecting any information you submit, you will be ". "creating a file containing this information on my website and you may ". "be sharing the contents of this file with the world. So, don't enter information ". "that you don't want to be public, OK?

"; } sub get_number { my $vars = shift; print "

". "Great! Here's what I got:

Title: ". $vars->{title}."
Email: ". $vars->{email}."
Name: ". $vars->{name}."

". "Make sure this is correct - if not, use the Back button and try again.

". "Now, enter the number of feeds you would like to include in your OPML file. ". "Don't worry about over-estimating a bit - blanks will be ignored. On the next page you will be ". "collecting the XML address for each feed you want to list. This number simply tells me ". "how many blank spaces to create.

\n". "

\n". "{title}."\">\n". "{email}."\">\n". "{name}."\">\n". "\n". "\n". "\n
Enter the Number of Sites to List:
\n

". "(No more than 200 sites, please.)

"; } sub get_feeds { my $vars = shift; unless ($vars->{number}) { print "No, no, you have to enter a number! No blank, not 0, a number!"; return; } if ($vars->{number} > 200) { print "Have a heart. My server can't take that much of a load. Please keep the number below 200.

"; return; } print "

\n
". "Rock and roll! Now let's fill out your OPML file.

". "In each space below, enter the XML address of the website, blog or feed you read regularly. ". "If you don't know the XML address, then in another browser window, go to the site's web page and look for a little ". "orange button, like this:

". "Move your cursor over the orange image on the web page and right-click. From the list ". "of options, select 'copy link address'. Then return to this page and paste the copied address into the form.

". "When you click \"Create OPML File\" this program will go to each XML file listed and ". "extract the information it needs. If there are any errors in the links, it will tell you. Don't worry ". "about the errors; you can use the Back button to fix any mistakes, and any mistakes ". "you don't fix will be ignored when the OPML file is created.

\n". "If you're just playing, here are a couple of file names you can paste into the form:

". "OLDaily: http://www.downes.ca/news/OLDaily.rss
" . "Seb Paquet: http://radio.weblogs.com/0110772/rss.xml

". "

\n". "{title}."\">\n". "{email}."\">\n". "{name}."\">\n". "\n"; my $x; for ($x=0;$x<$vars->{number};$x++) { print "\n"; } print "

". "Don't worry about leaving some spaces blank; I will ignore blank spaces.

". "Also, please note that since I am downloading information off the web, ". "the next page may take a few seconds to load.

\n

"; } sub create_file { my $vars = shift; unless ($vars->{xml}) { print "Sorry, I can't make an OPML site with zero links in it.

"; return; } print "

Yowza!! OK, let's see if I can't create a nice OPML file for you. Let's check those XML files...

    "; my $opstr = "\n" . " \n".$vars->{title}."\n" . " " . localtime(time) . "\n" . " " . localtime(time) . "\n" . " ".$vars->{name}."\n" . " ".$vars->{email}."\n" . " 1,3,17\n" . " 1\n" . " 164\n" . " 50\n" . " 672\n" . " 455\n" . " \n"; # print $opstr; my @xml_sites = split /\0/,$vars->{xml}; foreach my $xml_file (@xml_sites) { next unless ($xml_file); print "
  • $xml_file "; my $site_result = &get_rss($xml_file); if ($site_result eq "zero") { print "The site is not answering my request.
    "; next; } my ($parse_result,$line) = &parse_rss($site_xml,$site_result); unless ($parse_result eq "complete") { print "An error was encountered in the XML.
    "; next; } $opstr .= $line ."\n"; print "OK
    "; } $opstr .= " \n\n"; # Saviong OPML File my $opm = time . ".opml"; my $opmlfile = $ENV{'DOCUMENT_ROOT'} . "/opml/" . $opm; open OPML,">$opmlfile" || print "Error opening $opmlfile

    "; print OPML $opstr || print "Error saving $opmlfile

    "; close OPML; print "

Success!

Your OPML file has been created and saved at http://www.downes.ca/opml/".$opm."

"; print "You can save this file by right-clicking on the file name above and saving it to disk. Then you can place it on your webserver. ". "Place the url of your new OPML file into the form here. ". "If you do not have a website, simply copy the URL http://www.downes.ca/opml/".$opm." and place this". "into the form here" . "

Here is the full text of your OPML file. As you can see, any bad input (indicated in red) has not been included in the file.

". "

"; } sub get_rss { my $url = shift; $current_uri = $url; return "zero" unless ($url); my $ua = new LWP::UserAgent; $ua->agent("Edu_RSS/0.1 " . $ua->agent); $ua -> timeout(15); my $req = HTTP::Request->new(GET => $url); $req->content_type('application/x-www-form-urlencoded'); $req->content('match=www&errors=0'); my $res = $ua->request($req); if (!$res->is_success) { return "zero"; } my $webpage = $res->content; if ($webpage eq "") { return "zero"; } $debug_flag=0; $webpage = &clean_xml($webpage); my $new_file = "cache/" . &make_filename($url); open OUT,">$new_file" or print "Error opening $new_file: $!

"; print OUT "$webpage"; close OUT; return $new_file; } sub make_filename { my $url = shift; $url =~ s/http:\/\///mig; $url =~ s/\//_/mig; return $url; } # # clean_xml # # Cleans the garbage produced by commercial RSS generators that can't be # bothered to produce valid XML *sigh* # sub clean_xml { my $webpage = shift; $webpage =~ s/&/&/sig; # Because Blogger leaves & in URLs $webpage =~ s/&lt;/</sig; # Fixes perfecty valid but not gibbled $webpage =~ s/&gt;/>/sig; # caused by the previous line $webpage =~ s/&apos;/'/sig; $webpage =~ s/&quot;/"/sig; $webpage =~ s/'/'/sig; # Apostraphes, if you can believe it $webpage =~ s/\x27/'/mig; # Map strategically incompatible non-ISO characters in the # range 0x82 -- 0x9F into plausible substitutes where # possible. Caused by Microsoft editors used by, eg., Blogger # Thanks to demoronizer at http://www.fourmilab.ch/webtools/demoroniser/ $webpage =~ s/\x82/,/sig; $webpage =~ s-\x83-f-sig; $webpage =~ s/\x84/,,/sig; $webpage =~ s/\x85/.../sig; $webpage =~ s/\x88/^/sig; $webpage =~ s-\x89- °/°°-sig; $webpage =~ s/\x8B/~-sig; $webpage =~ s-\x99-TM-sig; $webpage =~ s/\x9B/>/sig; $webpage =~ s/\x9C/oe/sig; my $i; my $iline; my $oline; $webpage =~ s/[\x00-\x08\x10-\x1F\x80-\x9F]//sig; $webpage =~ s/â//sig; # Seems to be left over after the previous return $webpage; } sub parse_rss { my ($uri,$new_file) = @_; unless ($new_file) { return "Error: no rss url supplied.

"; } # Validate XML my $saxTester = SaxTester->new(); my $parser = XML::Parser::PerlSAX->new(Handler=>$saxTester); my $inputXmlFile = "$new_file"; my %parser_args = (Source => {SystemId => $inputXmlFile}); eval { $parser->parse(%parser_args); }; if ($@) { &error_report($uri,$@); return; } # Parse XML my $saxHandler = SaxHandler->new(); my $parser = XML::Parser::PerlSAX->new(Handler=>$saxHandler); my $inputXmlFile = "$new_file"; my %parser_args = (Source => {SystemId => $inputXmlFile}); $parser->parse(%parser_args); return "complete",$saxHandler->gimme(); } sub error_report { my ($errfile,$error_report_hash) = @_; # print "Yikes! I encountered an XML error in $errfile. Here's what I got:

    "; my ($error_line) = split /,/,$error_report_hash; #print "
  • $error_line

    "; # print "

Please check the XML source and try again.

"; } package SaxHandler; use strict; my $recording; my $current_element; my %record = {}; my %rss = {}; my %rss_data={}; my $opmldata; my $report; my $fieldname; my $fa; my @my_stack; %rss_data = {}; sub new { my $type=shift; return bless {}, $type; } sub gimme { return $opmldata; } sub start_document { my ($self) = @_; $channel{'feedkey'} = time; $channel{'uri'} = $current_uri; # print "A new channel ID has been created: $channel{'feedkey'}. Now I will save the following ", # "channel data for $current_uri:

"; } sub start_element { my ($self,$element) = @_; my %atts = %{$element->{Attributes}}; my $numAtts = keys(%atts); $current_element = $element->{Name}; my $element_name = lc($current_element); push @my_stack,$element_name; if ((&index_of("channel",@my_stack) >= 0) && (&index_of("item",@my_stack) == -1) && (&index_of("items",@my_stack) == -1)) { my $fieldstr = join "",@my_stack; ($fa,$fieldname) = split "channel",$fieldstr; } if ($numAtts > 0) { my ($thisAtt, $key, $val); for my $key (keys %atts) { $val = $atts{$key}; push @my_stack,$key; if ((&index_of("channel",@my_stack) >= 0) && (&index_of("item",@my_stack) == -1) && (&index_of("items",@my_stack) == -1)) { my $fieldstr = join "",@my_stack; ($fa,$fieldname) = split "channel",$fieldstr; $fieldname =~ s/rdf:/:/; $fieldname =~ s/^://; $channel{$fieldname} = $val; } pop @my_stack; } } } sub characters { my ($self,$character_data) = @_; my $text = $character_data->{Data}; if ((&index_of("channel",@my_stack) >= 0) && (&index_of("item",@my_stack) == -1) && (&index_of("items",@my_stack) == -1)) { $channel{$fieldname} .= $text; } } sub end_element { my ($self,$element) = @_; if ((&index_of("channel",@my_stack) >= 0) && (&index_of("item",@my_stack) == -1) && (&index_of("items",@my_stack) == -1)) { # print "$fieldname: $channel{$fieldname}

"; } my $temp = pop @my_stack; } sub end_document { my ($self) = @_; &save_channel_info; } sub print_channel_info { } sub save_channel_info { while (my($cx,$cy) = each %channel) { $channel{$cx} =~ s/\n|\r|\t/ /mig; $channel{$cx} =~ s/\s+/ /mig; $channel{$cx} =~ s/^\s+//; $channel{$cx} =~ s/\s$//mig; next unless ($channel{$cx}); # print CH_OUT "$channel{'id'}\t[schema]\t$cx\t$cy\n"; # print "$channel{'id'}[schema]", # "$cx$cy"; } $opmldata = "\n"; # print "

That's all the information I could find. This channel is now saved and will " . # "be harvested on a regular basis. Thank you for your submission."; } sub index_of { my $item = shift @_; my @arr = @_; # Get item and array from input my $index_count = 0; # Initialize counter foreach my $items (@arr) { # For each item in the array if ($item eq $items) { return $index_count; } # Return the counter value if it matches item $index_count++; # Increment the counter } return "-1"; # Return -1 if no match is returned } # -------------------------------------------------------------- package SaxTester; use strict; my $current_element; my %record = {}; my %channel = {}; my $report; my @my_stack; sub new { my $type=shift; return bless {}, $type; } sub start_element { my ($self,$element) = @_; } sub characters { my ($self,$character_data) = @_; my $text = $character_data->{Data}; if ($debug_flag) { print "$text |
"; } } sub end_element { my ($self,$element) = @_; } sub start_document { my ($self) = @_; } sub end_document { my ($self) = @_; }