#!/usr/bin/perl -w =pod =head1 NAME tv_grab_is - Grab TV listings for Iceland. =head1 SYNOPSIS tv_grab_is --help tv_grab_is [--config-file FILE] --configure [--gui OPTION] tv_grab_is [--config-file FILE] [--output FILE] [--days N] [--offset N] [--quiet] tv_grab_is --capabilities tv_grab_is --version =head1 DESCRIPTION Output TV listings for several channels available in Iceland. The data comes from www.skjarinn.is and stod2.is. The grabber relies on parsing HTML so it might stop working at any time. First run B<tv_grab_is --configure> to choose which channels you want to download. Then running B<tv_grab_is> with no arguments will output listings in XML format to standard output. B<--configure> Prompt for which channels, and write the configuration file. B<--config-file FILE> Set the name of the configuration file, the default is B<~/.xmltv/tv_grab_is.conf>. This is the file written by B<--configure> and read when grabbing. B<--gui OPTION> Use this option to enable a graphical interface to be used. OPTION may be 'Tk', or left blank for the best available choice. Additional allowed values of OPTION are 'Term' for normal terminal output (default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar. B<--output FILE> Write to FILE rather than standard output. B<--days N> Grab N days. The default is as many as the source carries. B<--offset N> Start N days in the future. The default is to start from today. B<--quiet> Suppress the progress messages normally written to standard error. B<--capabilities> Show which capabilities the grabber supports. For more information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities> B<--version> Show the version of the grabber. B<--help> Print a help message and exit. =head1 SEE ALSO L<xmltv(5)>. =head1 AUTHOR Tómas Edwardsson (tommi@tommi.org), based on Yngvi Þór Sigurjónsson (yngvi@teymi.is) version that fetched info from sjonvarp.is. Heavily based on tv_grab_dk by Jesper Skov (jskov@zoftcorp.dk). 
tv_grab_dk originally based on tv_grab_nl by Guido Diepen and Ed Avis (ed@membled.com) and tv_grab_fi by Matti Airas. =head1 BUGS First release. Fails to recognize actors when actors names are abbrevated. =cut use strict; use XMLTV::Version '$Id: tv_grab_is,v 1.8 2006/04/12 08:19:16 fgouget Exp $ '; use XMLTV::Capabilities qw/baseline manualconfig cache/; use XMLTV::Description 'Iceland'; use Getopt::Long; use HTML::TreeBuilder; use HTML::Entities; # parse entities use IO::File; use URI; use Encode qw(from_to); use Date::Manip; use XMLTV; use XMLTV::Memoize; use XMLTV::ProgressBar; use XMLTV::Ask; use XMLTV::Mode; use XMLTV::Config_file; use XMLTV::DST; use XMLTV::Get_nice; use XMLTV::Date; use XMLTV::Usage < \$opt_days, 'offset=i' => \$opt_offset, 'help' => \$opt_help, 'configure' => \$opt_configure, 'config-file=s' => \$opt_config_file, 'gui:s' => \$opt_gui, 'output=s' => \$opt_output, 'quiet' => \$opt_quiet, 'list-channels' => \$opt_list_channels, ) or usage(0); die 'number of days must not be negative' if (defined $opt_days && $opt_days < 0); usage(1) if $opt_help; XMLTV::Ask::init($opt_gui); my $mode = XMLTV::Mode::mode('grab', # default $opt_configure => 'configure', $opt_list_channels => 'list-channels', ); # File that stores which channels to download. my $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_is', $opt_quiet); if ($mode eq 'configure') { XMLTV::Config_file::check_no_overwrite($config_file); open(CONF, ">$config_file") or die "cannot write to $config_file: $!"; # find list of available channels my $bar = new XMLTV::ProgressBar('getting list of channels', 1) if not $opt_quiet; my %channels = get_channels(); die 'no channels could be found' if (scalar(keys(%channels)) == 0); update $bar if not $opt_quiet; $bar->finish() if not $opt_quiet; my @chs = sort keys %channels; my @names = map { $channels{$_} } @chs; my @qs = map { "add channel $_?" 
} @names; my @want = ask_many_boolean(1, @qs); foreach (@chs) { my $w = shift @want; warn("cannot read input, stopping channel questions"), last if not defined $w; # No need to print to user - XMLTV::Ask is verbose enough. # Print a config line, but comment it out if channel not wanted. print CONF '#' if not $w; my $name = shift @names; print CONF "channel $_ $name\n"; # TODO don't store display-name in config file. } close CONF or warn "cannot close $config_file: $!"; say("Finished configuration."); exit(); } # Not configuring, we will need to write some output. die if $mode ne 'grab' and $mode ne 'list-channels'; # If we are grabbing, check we can read the config file before doing # anything else. # my @config_lines; if ($mode eq 'grab') { @config_lines = XMLTV::Config_file::read_lines($config_file); } my %w_args; if (defined $opt_output) { my $fh = new IO::File(">$opt_output"); die "cannot write to $opt_output: $!" if not defined $fh; $w_args{OUTPUT} = $fh; } $w_args{encoding} = 'UTF-8'; my $writer = new XMLTV::Writer(%w_args); # TODO: standardize these things between grabbers. $writer->start ({ 'source-info-url' => 'http://www.skjarinn.is/', 'source-data-url' => 'http://www.skjarinn.is/tv', 'generator-info-name' => 'XMLTV', 'generator-info-url' => 'http://membled.com/work/apps/xmltv/', }); if ($opt_list_channels) { my $bar = new XMLTV::ProgressBar('getting list of channels', 1) if not $opt_quiet; my %channels = get_channels(); die 'no channels could be found' if (scalar(keys(%channels)) == 0); update $bar if not $opt_quiet; foreach my $ch_did (sort(keys %channels)) { my $ch_name = $channels{$ch_did}; my $ch_xid = "$ch_did"; $writer->write_channel({ id => $ch_xid, 'display-name' => [ [ $ch_name ] ], 'icon' => [{'src' => get_icon($ch_did)}] }); } $bar->finish() if not $opt_quiet; $writer->end(); exit(); } # Not configuring or writing channels, must be grabbing listings. 
die if $mode ne 'grab';

# Channels selected in the configuration file.  %channels maps the
# channel id to its display name; @channels keeps the order in which
# the channels appear in the file.
my (%channels, @channels);
my $line_num = 0;
foreach my $line (@config_lines) {
    ++$line_num;
    next if not defined $line;

    # FIXME channel data should be read from the site, and then the
    # config file only gives the XMLTV ids that are interesting.
    #
    if ($line =~ /^channel:?\s+(\S+)\s+([^\#]+)/) {
        my ($ch_did, $ch_name) = ($1, $2);
        $ch_name =~ s/\s*$//;
        push @channels, $ch_did;
        $channels{$ch_did} = $ch_name;
    }
    else {
        # Report the position counted by this loop; $. belongs to
        # whichever filehandle was read last and says nothing about
        # this entry.
        warn "$config_file:$line_num: bad line\n";
    }
}

######################################################################
# begin main program

my $now = parse_date('now');
die if not defined $now;
Date_Init('TZ=UTC');

my @to_get;

# the order in which we fetch the channels matters
my $today = UnixDate($now, '%Y-%m-%d');
die if not defined $today;

# Listings downloaded by cache_channel(): channel -> date -> programmes.
# process_listings_page() reads it for the stod2.is channels.
my %chan_cache;

foreach my $ch_did (@channels) {
    my $ch_name = $channels{$ch_did};
    my $ch_xid  = "$ch_did";

    my %channel = (
        id             => $ch_xid,
        'display-name' => [ [ $ch_name ] ],
    );
    # get_icon() returns '' for channels it does not recognise; do not
    # emit an icon element with an empty URL in that case.
    my $icon = get_icon($ch_did);
    $channel{icon} = [ { 'src' => $icon } ] if $icon ne '';
    $writer->write_channel(\%channel);

    # The stod2.is feed delivers all requested days in one download,
    # so fetch it once per channel up front.
    if ($ch_did =~ /^(.+?)\.stod2\.is$/) {
        my $site_id = $1;
        cache_channel(\%chan_cache, $site_id, $opt_days, $opt_offset);
    }

    for my $i ($opt_offset .. $opt_offset + $opt_days - 1) {
        # Request day when constructing URL since it is represented as
        # an integer offset from today.  Still pass in the computed
        # date - may need it sometime...
        my $day = UnixDate(DateCalc($today, "+ $i days"), '%Y-%m-%d');
        t("turned offset $i (from $today) into date $day");
        push @to_get, [ $i, $day, $ch_xid, $ch_did ];
    }
}

# Declare the progress bar unconditionally and only construct it when
# wanted: "my $x = ... if COND" has undefined behaviour in Perl.
my $bar;
$bar = XMLTV::ProgressBar->new('fetching data', scalar @to_get)
    if not $opt_quiet;

foreach my $job (@to_get) {
    my ($tv2date, $date, $ch_xmltv_id, $ch_tvgids_id) = @$job;
    t("going to get $ch_xmltv_id for $date");
    process_listings_page($writer, $ch_xmltv_id, $tv2date, $ch_tvgids_id, $date);
    $bar->update() if $bar;
}
$bar->finish() if $bar;
$writer->end();

######################################################################
# subroutine definitions

# Download the text listing of one stod2.is channel and store its
# programmes in the cache.
#
# arguments:
#    reference to the cache hash (filled as channel -> date -> list)
#    site id of the channel (the part before ".stod2.is")
#    number of days wanted
#    offset in days from today
#
# The feed is a sequence of sections separated by blank lines,
# alternating between a date header (month/day/year) and the block of
# programmes for that date.  Each cached programme is a hash with the
# keys time, title, desc and, when present, title_is.
sub cache_channel {
    my ($chan_cache, $ch_id, $days, $offset) = @_;

    my $num_days = $days + $offset;
    my $url = "http://old.stod2.is/oracledata/dagskratextaskra.asp?midill=$ch_id&dagar=$num_days";
    my $data = get_nice($url);
    from_to($data, 'iso-8859-15', 'utf-8');

    # Drop the preamble in front of the first date header.
    $data =~ s/^.*?\r\n.*?\r\n\r\n\r\n\r\n//s;

    my $expect_date = 1;
    my $date        = '';
    my $last_hour   = 0;

    foreach my $section (split /\r\n\r\n/, $data) {
        if ($expect_date) {
            # Stop at the first header that is not a date (trailing
            # markup, end of data).  Testing the header itself avoids
            # an empty match pattern, which Perl would silently replace
            # by the last successfully matched regex.
            my ($mon, $mday, $year) = split m{[/ ]}, $section;
            my $parsed = defined $year ? ParseDate("$mon/$mday/$year") : '';
            last if not $parsed;

            $date = UnixDate($parsed, '%Y-%m-%d');
            # The header is authoritative for the date, so hours seen
            # on the previous day must not trigger a rollover here.
            $last_hour   = 0;
            $expect_date = 0;
            next;
        }

        $expect_date = 1;

        # Peel off every programme that is followed by another one.
        while ($section =~ s/^(\d\d:\d\d) (.+?)\r\n(\d+:\d+) /$3 /s) {
            my ($time, $body) = ($1, $2);

            # The clock running backwards means we crossed midnight.
            my ($hour) = split /:/, $time;
            if ($hour < $last_hour) {
                $date = UnixDate(DateCalc($date, '+ 1 day'), '%Y-%m-%d');
            }
            $last_hour = $hour;

            my ($title, $rest) = split /\r\n/, $body, 2;
            _cache_programme($chan_cache, $ch_id, $date, $time, $title, $rest);
        }

        # What is left is the last programme of the section.  It goes
        # through the same title/description handling as the others.
        if ($section =~ /^(\d+:\d+) (.+)$/s) {
            my ($time, $body) = ($1, $2);
            my ($title, $rest) = split /\r\n/, $body, 2;
            _cache_programme($chan_cache, $ch_id, $date, $time, $title, $rest);
        }
    }
}

# Interpret the lines following a programme's first line and append the
# programme to the cache.
#
# With two or more following lines, the first is either the Icelandic
# title in parentheses or a continuation of the title, and the second
# is the description.  With a single following line, that line is the
# description.
sub _cache_programme {
    my ($chan_cache, $ch_id, $date, $time, $title, $rest) = @_;

    my @lines = defined $rest ? split(/\r\n/, $rest) : ();
    my %prog = (time => $time, title => $title);

    if ($lines[1]) {
        if ($lines[0] =~ /^\((.+)\)$/) {
            $prog{title_is} = $1;
        }
        else {
            $prog{title} .= " $lines[0]";
        }
        $prog{desc} = $lines[1];
    }
    else {
        $prog{desc} = $lines[0];
    }

    push @{ $chan_cache->{$ch_id}{$date} }, \%prog;
}

# Write the programmes of one channel and one day.
#
# arguments:
#    XMLTV::Writer object to write to
#    XMLTV id of channel
#    offset of the day from today (currently unused)
#    site id of the channel (currently unused)
#    the day as a YYYY-MM-DD string (at least until programmes cross
#      midnight)
#
# Channels ending in ".skjarinn.is" are scraped from the site's HTML
# schedule page; channels ending in ".stod2.is" are taken from the
# cache filled by cache_channel().  Other ids are ignored.
sub process_listings_page {
    # local $Log::TraceMessages::On = 1;
    my ($writer, $ch_xmltv_id, $tv2date, $tv2chan, $day_obj) = @_;

    my $day = UnixDate($day_obj, '%Q');

    # Start of the previously written programme.  Until the first
    # programme is written this is still the 10 character day string,
    # which the "early morning" test below relies on.
    my $laststart = $day_obj;

    t("getting channel $ch_xmltv_id, date $day");

    if ($ch_xmltv_id =~ /^(.+?)\.skjarinn\.is$/) {
        my $chid = $1;
        my ($m, $d) = UnixDate($day_obj, '%m', '%d');

        # We make an HTML::TreeBuilder object, get the information
        # from it and then delete it.
        my $t = HTML::TreeBuilder->new();
        $t->parse(get_nice("http://www.skjarinn.is/skjarheimur/dagskra/?portlet_action=true&action_wid=928&rp_928_tvStationId=$chid&rp_928_WeekDaySelected=$d.$m"));
        $t->eof();

        foreach my $table ($t->look_down('_tag' => 'table', 'class' => 'tvStationsProgram')) {
            my @rows = $table->look_down('_tag' => 'tr');
            shift @rows;    # presumably the header row - TODO confirm

            foreach my $row (@rows) {
                last if $row->look_down('_tag' => 'label');

                my $time_cell = $row->look_down('_tag' => 'td', 'class' => 'time');
                my $name_cell = $row->look_down('_tag' => 'td', 'class' => 'name');
                # A row without both cells is not a programme.
                next unless $time_cell and $name_cell;

                my $time  = $time_cell->as_text;
                my $title = $name_cell->as_text;

                # Skip programmes before 04:00 listed ahead of the
                # first programme written for this day.
                next if (split(':', $time))[0] < 4 && length($laststart) < 12;

                from_to($title, 'iso-8859-15', 'utf-8');

                my $desc_cell = $row->look_down('_tag' => 'td', 'class' => 'description');
                my $desc = $desc_cell ? $desc_cell->as_text : '';
                from_to($desc, 'iso-8859-15', 'utf-8');

                $laststart = _write_programme(
                    $writer, $ch_xmltv_id, $day, $laststart,
                    { time => $time, title => $title, desc => $desc },
                );
            }
        }

        $t->delete();
    }
    elsif ($ch_xmltv_id =~ /^(.+?)\.stod2\.is$/) {
        my $cached = $chan_cache{$1}{$day_obj} || [];
        foreach my $r (@$cached) {
            $laststart = _write_programme(
                $writer, $ch_xmltv_id, $day, $laststart, $r,
            );
        }
    }
}

# Turn one scraped programme into an XMLTV programme and write it.
#
# arguments:
#    XMLTV::Writer object to write to
#    XMLTV id of channel
#    the day as YYYYMMDD
#    start of the previously written programme (or the day string)
#    hash reference with the keys time, title and optionally desc and
#      title_is
#
# Returns the start time of the programme, to be passed back in as the
# previous start on the next call.  $TZ and $LANG are file level
# settings expected to be defined earlier in the file.
sub _write_programme {
    my ($writer, $ch_xmltv_id, $day, $laststart, $r) = @_;

    my $title = $r->{title};
    my $desc  = $r->{desc};

    my $start = parse_local_date("$day $r->{time}", $TZ);

    # Try to detect if we have crossed midnight
    if (Date_Cmp($start, $laststart) < 0) {
        $start = DateCalc($start, '+ 1 day');
    }
    my ($start_base, $start_tz) = @{ date_to_local($start, $TZ) };

    # "(3:12)" in the title means episode 3 of 12.  The xmltv_ns
    # system counts episodes from zero and separates season, episode
    # and part with dots, so that becomes ".2/12.".
    my $episode_num;
    if ($title =~ s/\s*\((\d+):(\d+)\)//) {
        my ($episode, $total) = ($1, $2);
        if ($episode >= 1) {
            $episode_num = '.' . ($episode - 1);
            $episode_num .= "/$total" if $total > 0;
            $episode_num .= '.';
        }
    }
    $title =~ s/\s*\(e\)//;    # remove rerun indication

    my @title_data = $r->{title_is}
        ? ([ $title, 'en' ], [ $r->{title_is}, 'is' ])
        : ([ $title, $LANG ]);

    # Pull director and actors out of the description.
    my ($director, @actors);
    if (defined $desc and length $desc) {
        if ($desc =~ s/Leikstjóri(:|)\s*([^.]*)\.//i) {
            $director = $2;
        }
        # "aðalhulverkum" is kept alongside the correct spelling
        # "aðalhlutverkum" in case the misspelling occurs in the data.
        if ($desc =~ s/\s*(Aðalhlutverk:|[mM]eðal leikenda eru|Aðalhlutverk leika|[íÍ] aðalh(?:lut|ul)verkum eru|[Ll]eikendur eru)\s*([^.]*)\.//i) {
            @actors = split /, | og /, $2;
        }
    }

    my %prog = (
        channel => $ch_xmltv_id,
        title   => \@title_data,
        start   => UnixDate($start_base, '%q') . " $start_tz",
    );

    # Fall back to the title when there is no description.
    $prog{desc} = [ [ ($desc ? $desc : $title), $LANG ] ];

    $prog{'episode-num'} = [ [ $episode_num, 'xmltv_ns' ] ]
        if defined $episode_num;

    my %credits;
    $credits{actor}    = \@actors       if @actors;
    $credits{director} = [ $director ]  if $director;
    $prog{credits} = \%credits if %credits;

    $writer->write_programme(\%prog);

    return $start;
}

# Return the available channels of both sources as a hash mapping the
# channel id to its display name.
sub get_channels {
    my %channels;
    get_channels_visir(\%channels);
    get_channels_skjarinn(\%channels);
    return %channels;
}

# Add the channels offered by skjarinn.is to the hash passed by
# reference.  Ids get the suffix ".skjarinn.is".
sub get_channels_skjarinn {
    my $channels = shift;

    my $url = 'http://www.skjarinn.is/skjarheimur/dagskra/';
    my $t = HTML::TreeBuilder->new();
    $t->parse(get_nice($url));
    $t->eof();

    # The channels and their IDs are set in a select
    foreach my $select ($t->look_down('name' => 'tvStationId')) {
        foreach my $option ($select->look_down('_tag' => 'option')) {
            my $channel = $option->attr('value');
            next if !$channel;

            my $name = $option->as_text;
            $name =~ s/^\s+//;
            $name =~ s/\s+$//;

            $channel .= ".skjarinn.is";
            from_to($channel, 'iso-8859-15', 'utf-8');
            from_to($name, 'iso-8859-15', 'utf-8');
            $channels->{$channel} = $name . " frá Skjánum";
        }
    }

    $t->delete();
}

# Add the channels offered by stod2 to the hash passed by reference.
# Ids get the suffix ".stod2.is".  Channels are recognised by links to
# the text listing that carry a "midill" parameter.
sub get_channels_visir {
    my $channels = shift;

    my $url = 'http://stod2.visir.is/?PageID=148';
    my $t = HTML::TreeBuilder->new();
    $t->parse(get_nice($url));
    $t->eof();

    foreach my $link ($t->look_down('_tag' => 'a')) {
        # Anchors without a target are skipped, they do not end the
        # search.
        my $href = $link->attr('href');
        next if !$href;
        next if $href !~ /oracledata\/dagskratextaskra.asp/;
        next if $href !~ /midill=([^&]+)/;

        # Test for duplicates on the full id, which is what is stored.
        my $channel = "$1.stod2.is";
        next if exists $channels->{$channel};

        $channels->{$channel} = $link->as_text . " frá Stöð 2";
    }

    $t->delete();
}

# Icon URL for a given channel id, or the empty string when the id
# belongs to neither source.
sub get_icon {
    my ($url) = @_;

    if ($url =~ /^(.+?)\.stod2\.is$/) {
        my $chan = $1;
        $chan =~ s/^SYN2$/SYN/;    # SYN2 uses the SYN logo
        return "http://stod2.visir.is/images/stod2/stodvar_logo/new/$chan" . "_logo_sel.gif";
    }
    if ($url =~ /^(.+?)\.skjarinn\.is$/) {
        return "http://www.skjarinn.is/other_files/skjarinn/img/$1.gif";
    }
    return "";
}