#!/local/bin/perl
# showdead.pl   Daniel MacKay  Daniel.MacKay@Dal.Ca
# 990922 DEM    Scan the log from a "htdig -s" run and produce pages listing
#               all the dead links for your web managers to browse.
# dig_report.pl a variant by Malcolm.Austen@oucs.ox.ac.uk
#               for use on daneel.ox.ac.uk
# 990929-30 MDA first hack with a view to counting pages at each level
# 991201    MDA tidy up the presentation
# 000119    MDA exclude "not found" and "redirect" urls from the table
# 0103..    MDA reordered server lists by reverse IPname
#               added lists of all pages indexed (+hop counts)
#
# Typical(?) usage:
#   perl dig_report.pl < htdig.stdout
#
# Output (all written below $prefix):
#   index.html      table of page counts per server and depth, followed by
#                   links to the per-server lists of pages not indexed
#   server.names    one server name per line, in reverse-IPname order
#   ni_SERVER.html  "Not found" links, grouped by the referring server
#   hl_SERVER.html  every page seen for SERVER with its hop count
#
# NOTE(review): the copy of this script under review had lost its HTML tags
# and heredoc openers. The markup emitted below is a reconstruction around
# the surviving text and variables -- confirm it against a known-good copy
# or the deployed pages before relying on the exact layout.

use strict;
use warnings;

# that's where the html pages will be written
my $prefix = "db/new/report/";

# this is the sub-heading for the html pages, could be better!
my $title = "ht://Dig report generated on " . localtime();

# print "\n$title\n\n";

##
# helpers
##

# Escape text taken from the log so it is safe in HTML text and attributes.
sub escape_html {
    my ($text) = @_;
    return "" unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Reverse the dot-separated labels of a host name (www.ox.ac.uk <-> uk.ac.ox.www)
# so that sorting groups servers by domain.
sub reverse_host {
    my ($host) = @_;
    return join( ".", reverse split( /\./, $host ) );
}

# Render a counter for a table cell; a counter that was never set stays empty.
sub cell {
    my ($value) = @_;
    return defined $value ? $value : "";
}

# Open $path for writing, make it world-readable, and return the handle.
# $what is the short description used in the error message.
sub open_page {
    my ( $path, $what ) = @_;
    open( my $fh, '>', $path ) or die "can't open $what ($path): $!\n";
    chmod( 0644, $path ) or warn "can't chmod $path: $!\n";
    return $fh;
}

# Write the closing markup (if any) and close a page, reporting write errors.
sub close_page {
    my ( $fh, $tail ) = @_;
    print {$fh} $tail if defined $tail;
    close($fh) or die "error closing report file: $!\n";
    return;
}

# One bullet on the index page linking to a server's not-indexed list.
sub print_not_indexed_summary {
    my ( $fh, $host, $count ) = @_;
    my $host_html = escape_html($host);

    # "./" keeps a host:port name from being read as a URL scheme
    print {$fh} qq{<li><a href="./ni_$host_html.html">$host_html</a>\n};
    print {$fh} "- $count pages could not be indexed</li>\n";
    return;
}

##
# pass 1: scan the log
##

my $maxd = 1;       # deepest level seen (table always shows at least 0..1)
my $totcount;       # pages counted overall
my $badtot;         # pages whose depth field was 255
my @dcount;         # $dcount[depth]          = pages at that depth
my %bad_for;        # $bad_for{server}        = pages with depth 255
my %servtot;        # $servtot{server}        = pages counted for server
my %sites;          # $sites{server}[depth]   = pages at that depth
my @hops;           # [ server, url, depth ] for every page counted
my @not_found;      # [ reversed referring server, referrer, missing url ]

while ( my $line = <> ) {
    chomp $line;
    $line =~ s/\s+$//;    # strip trailing spaces

    #
    # count pages and depths
    # this pattern matches a "digits:digits:digits:serverURL" record
    # we want the third number (depth at which the page was indexed)
    # and the server name
    if ( $line =~ m{^(\d+):(\d+):(\d+):http://(.*?)/(.*):} ) {
        my ( $depth, $server, $path ) = ( $3 + 0, $4, $5 );
        next if $line =~ /(?:not found|redirect)$/;

        $totcount++;
        $servtot{$server}++;
        if ( $depth == 255 ) {
            $badtot++;
            $bad_for{$server}++;
        }
        else {
            $maxd = $depth if $depth > $maxd;
            $dcount[$depth]++;
            $sites{$server}->[$depth]++;
        }

        # note away the URL and hopcount
        push @hops, [ $server, "http://$server/$path", $depth ];
    }

    #
    # this notes the pages that were not indexed (needs -s option on htdig)
    elsif ( $line =~ /^Not found:\s+(.*) Ref: (.*)$/ ) {
        my ( $missing, $ref ) = ( $1, $2 );

        # the list is keyed on the server of the *referring* page
        my $host = $ref;
        $host =~ s{^http://}{};
        $host =~ s{/.*$}{};
        push @not_found, [ reverse_host($host), $ref, $missing ];
    }
}

##
# index page: table of pages per server and depth
##

my $serv = open_page( "$prefix/index.html", "dead index file" );
open( my $servernames, '>', "$prefix/server.names" )
    or die "can't open names file ($prefix/server.names): $!\n";

print {$serv} <<"HTML_1";
<html>
<head><title>ht://Dig report</title></head>
<body>
<h1>ht://Dig report</h1>
<h2>$title</h2>
<p><a href="#notindexed">Jump forward</a> to lists of pages that were not indexed.</p>
<p>This table shows the number of pages indexed at various depths from each server.
The depth is measured from (one of) the server start points. Some level counts may be misled by, for example, a high level page having already been indexed at a lower level from another server start point or by one server having serveral start point entries (with different start directories).
The "?" column gives the count for pages that had a dubious(unset?) depth in the log record.</p>
HTML_1

# first line of table gives overall totals
print {$serv} "<table border=\"1\">\n";
print {$serv} "<tr><th>Overall</th><td>", cell($totcount), "</td><td>",
    cell($badtot), "</td>\n";
for my $depth ( 0 .. $maxd ) {
    print {$serv} "<td>", cell( $dcount[$depth] ), "</td>\n";
}
print {$serv} "</tr>\n";

# second line of table is just the headers
print {$serv} "<tr>\n";
print {$serv} "<th>Server\\Depth</th><th>Total</th><th>?</th>\n";
for my $depth ( 0 .. $maxd ) {
    print {$serv} "<th>$depth</th>\n";
}
print {$serv} "</tr>\n";

# need to build reverse site lookup table before going on
my %revsites;
for my $server ( keys %sites ) {
    $revsites{ reverse_host($server) } = $server;
}

# then one line per server found in the log
for my $rev ( sort keys %revsites ) {
    my $server      = $revsites{$rev};
    my $server_html = escape_html($server);

    print {$servernames} "$server\n";
    print {$serv} "<tr>\n";
    print {$serv}
        qq{<th><a href="./hl_$server_html.html">$server_html</a></th>\n};
    print {$serv} "<td>", cell( $servtot{$server} ), "</td><td>",
        cell( $bad_for{$server} ), "</td>\n";
    for my $depth ( 0 .. $maxd ) {
        print {$serv} "<td>", cell( $sites{$server}[$depth] ), "</td>\n";
    }
    print {$serv} "</tr>\n";
}
close($servernames) or die "error closing names file: $!\n";
print {$serv} "</table>\n";

##
# table of pages/depths is done, now present the lists of non-indexed pages
##

print {$serv} <<"HTML_2";
<hr>
<h2><a name="notindexed">Pages that were not indexed</a></h2>
<p>These lists (one per server) show the pages that ht://Dig could not index. The page may be missing but it may simply be that the ht://Dig process was forbidden access to the page.</p>
<ul>
HTML_2

my $page_tail = "</ul>\n</body>\n</html>\n";

{
    my @ordered = sort {
               $a->[0] cmp $b->[0]
            or $a->[1] cmp $b->[1]
            or $a->[2] cmp $b->[2]
    } @not_found;

    my $out;             # page currently being written, if any
    my $current = "";    # server that page belongs to
    my $count   = 0;     # entries written to that page

    for my $record (@ordered) {
        my ( $rev, $ref, $missing ) = @$record;
        my $host = reverse_host($rev);

        if ( !defined $out or $host ne $current ) {

            # finish the previous server's page and summarise it on the index
            if ( defined $out ) {
                close_page( $out, $page_tail );
                print_not_indexed_summary( $serv, $current, $count );
            }
            $count = 0;

            # print "Now writing to $host\n" ;
            $out = open_page( "$prefix/ni_$host.html", "file for $host" );
            my $host_html = escape_html($host);
            print {$out}
                "<html>\n<head><title>Un-Indexed links on $host_html</title></head>\n";
            print {$out} "<body>\n<h1>Un-Indexed links on $host_html</h1>\n";
            print {$out} "<h2>$title</h2>\n<ul>\n";
            $current = $host;
        }

        $count++;
        my $missing_html = escape_html($missing);
        my $ref_html     = escape_html($ref);
        print {$out} qq{<li><a href="$missing_html">$missing_html</a>\n};
        print {$out} qq{<br><a href="$ref_html">$ref_html</a>\n</li>\n};
    }

    if ( defined $out ) {
        close_page( $out, $page_tail );
        print_not_indexed_summary( $serv, $current, $count );
    }
}

close_page( $serv, $page_tail );

##
# that dealt with the main table and the not-indexed lists
# now to dump out the lists of indexed pages, one per server
# and ordered by hopcount
##

{
    # server first (one file each), then numeric hop count, then URL
    my @ordered = sort {
               $a->[0] cmp $b->[0]
            or $a->[2] <=> $b->[2]
            or $a->[1] cmp $b->[1]
    } @hops;

    my $table_tail = "</table>\n</body>\n</html>\n";
    my $out;
    my $current = "";

    for my $record (@ordered) {
        my ( $server, $url, $hop ) = @$record;

        if ( !defined $out or $server ne $current ) {
            close_page( $out, $table_tail ) if defined $out;

            $out = open_page( "$prefix/hl_$server.html", "file for $server" );
            my $server_html = escape_html($server);
            print {$out}
                "<html>\n<head><title>Page &amp; hop count list for $server_html</title></head>\n";
            print {$out}
                "<body>\n<h1>Page &amp; hop count list for $server_html</h1>\n";
            print {$out} "<h2>$title</h2>\n";
            print {$out} "<table>\n";
            print {$out} "<tr><th>HopCount</th> <th>URL</th></tr>\n";
            $current = $server;
        }

        my $url_html = escape_html($url);
        print {$out} "<tr><td>", sprintf( "%2d", $hop ),
            qq{</td><td><a href="$url_html">$url_html</a></td></tr>\n};
    }

    close_page( $out, $table_tail ) if defined $out;
}

# all done