#!/usr/bin/perl #------------------------------------------------------------------------------- # $Id: mw4.pl,v 1.3 2003/05/02 12:23:43 shevett Exp shevett $ #------------------------------------------------------------------------------- # mw.pl - mailwatch... uhh, watches mail, what else? # see http://www.stonekeep.com/other_projects.html # # 3/17/03 dbs - removed lee's email, changed URL for stonekeep. # 1/16/01 dbs - The 'hourly distribution' was using the wrong number # for the 'total number of posts', and therefore the percentages # were wrong. Fixed. # 1/08/01 dbs - Fixed some division by zero errors that happen on the # turn of the year. Rewrote the entire 'hourly distribution' # section to show a bar chart rather than just raw numbers. # 1/24/98 ls - verified output is correct. Made $postdate and $recdate # into the same variable (a questionable move!). It's possible # that if the log file is very long, errors occur. # 10/1/97 ls - Added support for multiple log files, file # format checking, pushed many options out to the command line, # made usage (-h) work correctly, got rid of that annoying "Odd # number of elements" error. Ignores first line of headers if # blank (Pine sticks an extra line on Exported mail). # added Yesterday feature # Changes made by Lee Sonko leeNOSPAMlee@lee.org # 2/12/97 des - fixed formatting error in top 'posts by day' line, # as well as problems with calculating the averatge # posts per day (it was doing 'em calculated total/6. eek!) # 3/31/96 des - Major changes. There was a bug in the initial scan # that may result in miscounted number of posts, due to # overwriting an element in a hash. Fixed. Reformatted # some stuff in the reports, and added a "accounting # for" calculation in the 'top' sections. # 2/19/96 des - zounds. trailing spaces in the subject lines showed # up as different entries in the listing. fixed. # 1/19/96 des - changed some problems with very low usage through # the logs. show interval in 'top 10' lists. # 1/11/96 des - revamped distribution, fixed average problem, # added 'period' to limit how much data to parse # 1/06/96 des - hourly distribution was showing '10's. fixed. # 1/03/96 des - unleashed upon the world. # # Here's a short explanation of what the output is: # munged "Date:" field down to minute: munged "From" field down to minute # number of lines in body: From: Subject use Time::Local; # from perl5 require "ctime.pl"; #-------------------------------------------------------------------- # some variables # $logfile="-"; # The default name of the log file $listname="gonnachange"; # a dummy value that will get changed $version="0.44"; # Version $testing=0; # Set to 1 to print to STDOUT when running (no log) $threshold=0; # Stop summaries when down to what value? (0 = all) $plimit=10; # Default maximum number of lines to print per summary $perioddays=7; # Default of how far back to report (days) $period= ($perioddays * 86400 ); # convert period to seconds $prog_name = $0; # Who I am $prog_name =~ s/^.*\/(.*)/$1/; # Keep only base name #-------------------------------------------------------------------- # some constants # $Months="JanFebMarAprMayJunJulAugSepOctNovDec"; #-------------------------------------------------------------------- # u s a g e - show da bums how ta do it. # sub usage { $period = ($period / 86400); print STDERR <) { $totallines++; print "< $_" if ($testing); chomp; # if the first line in the message is blank, ignore it if ($totallines eq 1) { if (/^$/) {next}; } if ( /^From / && ! $rfline ) { ($rfline) = ($_ =~ /(\w\w\w\s+\d+ \d+:\d+:\d+ \d\d\d\d)/) ; next; } if ( /^From: / && ! $fline ) { ($fline) = ($_ =~ /From: (.*)/) ; next; } if ( /^Subject: / && ! $sline ) { ($sline) = ($_ =~ /: (.*)/) ; next; } if ( /^Date: / && ! $dline ) { ($dline) = ($_ =~ /: (.*)/) ; next; } if ( /^$/ ) { $lcount=0; while (<>) { $lcount++; $inccount++ if (/^(:|>)/) ; } } } # # parse up date posted line... # Could be any one of: # Sat, 30 Dec 1995 00:33:29 -0500 (EST) # Fri, 22 Sep 1995 13:26 -0500 (EST) # Thu, 9 Nov 1995 14:50:54 -0400 (GMT-0400) # Sat, 30 Dec 95 0:05:27 EST # Tue, 03 Oct 1995 13:52:58 -0400 # Wed, 4 Oct 1995 10:03:20 -0500 # Thursday,November 02,1995 5:37PM # Thursday, November 02, 1995 3:22PM # 95-12-05 21:58:06 EST # 24 Dec 1995 22:28:22 EST # 16 Oct 95 23:58:57 EDT # print "Posted: $dline\n" if ($testing); ($_day,$_tmon,$_year,$_hour,$_min,$_sec) = ($dline =~ /(\d+)\s+(\w+)\s+(\d+)\s+(\d+):(\d+)((:\d+| ))/); $nmon=(index($Months,$_tmon) / 3); $_year=$_year-1900 if ($_year > 1900) ; print "dline: s: $_sec, m: $_min, h: $_hour, d: $_day, nm: $nmon, y: $_year\n" if ($testing); $postdate=&timelocal($_sec,$_min,$_hour,$_day,$nmon,$_year) ; # # parse up date received line... (Aug 25 13:05:53 1995) # Dec 1 12:41:18 1995 # ($_tmon,$_day,$_hour,$_min,$_sec,$_year) = ($rfline =~ /(\w\w\w)\s+(\d+)\s+(\d+):(\d+)(:\d+) (\d\d\d\d)/); $_sec=0 if (! $_sec); $nmon=(index($Months,$_tmon) / 3); $_year=$_year-1900; print "rfline: s: $_sec, m: $_min, h: $_hour, d: $_day, nm: $nmon, y: $_year\n" if ($testing); print "timelocal($_sec,$_min,$_hour,$_day,$nmon,$_year)" if ($testing); $recdate=&timelocal($_sec,$_min,$_hour,$_day,$nmon,$_year); print "opening logfile $logfile\n" if ($testing); open(LOG,">>$logfile") || die "Cannot open $logfile: $!\n"; print "writing\n" if ($testing); print "$postdate:$recdate:$lcount:$inccount:$fline:$sline\n" if ($testing); # Maybe I'm foolish but I don't see the use of having both $postdate and $recdate. # Some systems use one, and some the other. So, in a fit of kludginess, # I'm just going to record the From header twice - Lee Sonko 1/23/98 # print LOG "$postdate:$recdate:$lcount:$inccount:$fline:$sline\n"; print LOG "$recdate:$recdate:$lcount:$inccount:$fline:$sline\n"; close(LOG); print "written! \n" if ($testing); } sub numeric { $a <=> $b } #-------------------------------------------------------------------- # g e n h i s t o r y - figger out postings for the last coupla # days, number per day. # sub genhistory { my($_dline,$_vline,$m,$d,$y); $index=0; $hcount=0; $pcounter=0; $midnight=&calcmidnight(time); if ($yesterday) { $midnight=$midnight - 86400 } ($w,$m,$d) = (&ctime($midnight) =~ /(\w\w\w)\s+(\w+)\s+(\d+)\s+\d+:\d+:\d+ /); $_dline=sprintf("%4s %2d ",$m,$d); $_vline=""; # print "Midnight is $midnight\n"; for $i (reverse(sort keys %pdates)) { # print "Checking $i against $midnight for week $index "; if ($i < $midnight) { last if ($index == 6); # print "Yes\n"; $_vline="${_vline}".sprintf(" %-3.0f | ",$pcounter); $midnight=$midnight - 86400; $pcounter=1; $hcount++; $index++; ($w,$m,$d) = (&ctime($midnight) =~ /(\w\w\w)\s+(\w+)\s+(\d+)\s+\d+:\d+:\d+ /); $_dline="$_dline|".sprintf(" %3s %2d ",$m,$d); } else { # print "No\n"; $pcounter++; $hcount++; } } $_vline="${_vline}".sprintf(" %-3.0f",$pcounter); print "\nBreakdown by day: ($hcount posts, average of "; print sprintf("%3.1f",($hcount / $perioddays)); print " posts per day.)\n"; print "----------------------------------------------------------------------\n"; print "\t$_wline\n"; print "\t$_dline\n"; print "\t$_vline\n\n"; } #-------------------------------------------------------------------- # g e n d i s t - figger out the distribution of posting times # sub gendist { my($output,$i,$hour,@dist); for $i (0..23) { @dist[$i]=0 }; $max = 0; $htotal = 0; for $i (keys %pdates) { ($hour) = ($pdates{$i} =~ /\w\w\w\s+\w+\s+\d+\s+(\d+):\d+:\d+ /); @dist[$hour]++; $max = (@dist[$hour] > $max) ? @dist[$hour] : $max; $htotal++; } if ($max > 45) { $tic = 45 / $max; } else { $tic = 1; } $tictext = sprintf("%3.1f",$tic); print "\nHourly distribution of postings: (* = $tictext post(s), total of $htotal)\n"; print "----------------------------------------------------------------------\n"; $output=""; for $i (0..23) { $div = (@dist[$i] / $htotal) * 100; print "\t" . sprintf("%2s-%2s : %-4.1f%% [%3i] \t",$i,$i+1,$div,@dist[$i]) ; print "*" x int($tic * @dist[$i]); print "\n"; } print "\n"; } #-------------------------------------------------------------------- # s u m m a r i z e - give em da woiks. # sub summarize { local($pdate,$rdate,$poster,$subject); local($count,$posters,%pdates,%posters); $cutoff=calcmidnight(time - $period + 86400); %plist=(); $posterc=0; %slist=(); $subjectc=0; print "Cutoff $cutoff ($period seconds ago).\n" if ($testing); open(INP,"$logfile") || die "summarize: error opening $logfile: $!\n"; $lastmidnight = calcmidnight(time); while() { chop; ($pdate,$rdate,$lines,$inclines,$poster,$subject)=split(":",$_,6); if ($yesterday) { if ($pdate < $cutoff || $pdate >= $lastmidnight) { print "skipped $pdate by $poster\n" if ($testing); next; } } else { if ($pdate < $cutoff) { print "skipped $pdate by $poster\n" if ($testing); next; } } if (! $plist{$poster}) { $posterc++; $plist{$poster}=$poster; } if (! $slist{$subject}) { $subjectc++; $slist{$subject}=$subject; } print "processing $pdate...\n" if ($testing); $count++; chop($odate=$pdate>$rdate ? &ctime($pdate) : &ctime($rdate)) if (! "$odate"); $posters{$poster}=$posters{$poster} + 1; print "$posters{$poster} - $poster \n" if ($testing); $subject=~s/^\[[^:]*: (.*)]$/$1/; $subject=~s/^Re: (.*)$/$1/; $subject=~s/^RE: (.*)$/$1/; $subject=~s/^(.*)\s+$/$1/; $sublist{$subject}++; if ($pdates{$pdate}) { print "exists! $poster, $subject\n" if ($testing); $pdate="${pdate}.1"; } $pdates{$pdate}=&ctime($pdate); chop($ndate=$pdate > $rdate ? &ctime($pdate) : &ctime($rdate)); } close(INP); $interval=sprintf("%2.2f",($perioddays)); if ($yesterday) { chop($repinv=ctime($cutoff-86400)); } else { chop($repinv=ctime($cutoff)); } if ($yesterday) { $lastmidnight = calcmidnight(time) - 1; chop($repinvuntil = &ctime($lastmidnight)); } else { chop($repinvuntil = &ctime(time)); } print "$listname Traffic Report ".&ctime(time); print "======================================================================\n"; print "Report interval From--: $repinv ($interval days)\n"; print "Report interval Until-: $repinvuntil\n"; print "Oldest post ----------: $odate\n"; print "Most recent post -----: $ndate \n"; print "Total posts ----------: $count\n"; print "Total unique posters -: $posterc\n"; print "Total unique subjects : $subjectc\n"; print "======================================================================\n"; &genhistory(); for $i (sort keys %posters) { $perc = sprintf("%3.1f",($posters{$i} / $count * 100)); push(@parray,"$posters{$i} ($perc %)\t$i\n"); } $pcounter=0; $topcount = 0; for $i (reverse(sort numeric @parray)) { ($num,$text)=split("\t",$i); last if (($num == $threshold) && ($threshold)); last if ($pcounter == $plimit); @outarray[$pcounter++]=$i; $topcount=$topcount+$num; } $ptext=sprintf("%3.1f",($topcount / $count) * 100); print "Top $plimit Posters (Representing ${ptext}% of the total traffic.)\n"; print "----------------------------------------------------------------------\n"; for $i (@outarray) { print "\t$i"; } @parray=(); for $i (sort keys %sublist) { $perc = sprintf("%3.1f",($sublist{$i} / $count * 100)); push(@sarray,"$sublist{$i} ($perc %)\t$i\n"); } $pcounter=0; $topcount=0; for $i (reverse(sort numeric @sarray)) { ($num,$text)=split("\t",$i); last if (($num == $threshold) && ($threshold)); last if ($pcounter == $plimit); @outarray[$pcounter++]=$i; $topcount=$topcount+$num; } $ptext=sprintf("%3.1f",($topcount / $count) * 100); print "\nTop $plimit subjects (Representing ${ptext}% of the total traffic.)\n"; print "----------------------------------------------------------------------\n"; for $i (@outarray) { print "\t$i"; } &gendist(); print "======================================================================\n"; print "Mailwatch v$version by Shayde. http://www.stonekeep.com/\n"; print "with changes by Lee Sonko. http://www.lee.org\n"; } #-------------------------------------------------------------------- # m a i n - the naughty bits... # # Parse options while ($ARGV[0] =~ /^-/) { $_ = shift; last if /--/; if ($_ eq '-s') { $summarize = "yes";} elsif ($_ eq '-l') { $logfile = shift; # protect $logfile from being tainted by an outside variable if ($logfile eq "-"){$logfile = "-";} } elsif ($_ eq '-t') { $listname = shift} elsif ($_ eq '-h') { &usage} elsif ($_ eq '-d') { $perioddays = shift; $period= ($perioddays * 86400 )} elsif ($_ eq '-n') { $plimit = shift} elsif ($_ eq '-y') { $yesterday = "yes"} elsif ($_ eq '-v') { $testing = 1} elsif ($_ eq '-V') { print STDERR "\n"; print STDERR "$prog_name $version (c)2001 by Dave Belfer-Shevett\n"; print STDERR "==================================================\n"; print STDERR "Thanks to Lee Sonko for additional mods\n"; print STDERR "More information available at http://www.stonekeep.com/\n"; exit 0; } } # Verify that the file we're gonna append to is a mailwatch formatted file unless ($logfile eq "-"){ # We're not writing to stdout, go ahead... open (CHECK,"$logfile"); while () { $_ =~ /^(\d\d\d\d\d\d\d\d\d\d?)\:(\d\d\d\d\d\d\d\d\d\d?)\:(\d*)\:(\d*)\:(.*)\:(.*)/; # don't go for #6 b/c the Subject might be blank $errmsg="Format error on line $.\n"; $errmsg.="Text line is $_\n"; $errmsg.="$logfile isn\'t in $prog_name format. Exiting.\n"; die "$errmsg" unless ($5); } close (CHECK); } # the listname wasn't changed manually, do it automatically if ($listname eq "gonnachange") { $listname = $logfile; $listname =~ s/^(.*\/)*(.*)/$2/; # trim the pathing $listname =~ s/^(.*)\.log$/$1/; # trim the ".log" } if ($summarize eq "yes") { &summarize; exit 0; } else { &snarfinput; exit 0; }