#!/bin/tcsh
## grab-spamcop
## get spamcop URI and minutes ago data and store in flat text db
##
## Works entirely on spamcop.* files in the current directory and relies on
## the helper ./epoch_seconds_utc being present there.
##
## Revision history:
## 1.00 initial
## 1.01 fold case to lower, handle http, https, ftp URIs 2/26/04 -- Jeff Chan
## 1.02 add sort -u of current data for the domain parser 2/27/04 -- Jeff Chan
## 1.03 moved sed hyperlink patterns to external files 2/27/04 -- Jeff Chan
## 1.04 changed from head to time-based expire of old data 2/29/04 -- Jeff Chan
## 1.05 removed unused time computation, csh math integer 3/1/04 -- Jeff Chan
## 1.06 added sed file to remove illegal domain characters 4/1/04 -- Jeff Chan
## 1.07 added -a flag to greps to treat all as text 5/3/04 -- Jeff Chan

## grab current spamcop URLs
/usr/local/bin/lwp-request "http://www.spamcop.net/w3m?action=inprogress&type=www" > spamcop.raw

## get current UTC time, expressed in seconds since epoch
@ now = `./epoch_seconds_utc`

## extract "minutes ago" data from raw spamcop page
## fold to lower case first since sed cannot match case-insensitively
## NOTE(review): the first sed expression below has an empty pattern and the
## egrep pattern looks incomplete; both appear to have lost text in this copy
## of the script.  They are kept exactly as found -- confirm against a
## known-good copy before relying on spamcop.minute.
/usr/bin/tr "[:upper:]" "[:lower:]" < spamcop.raw \
    | /usr/bin/egrep -a "^.*min." \
    | /usr/bin/sed -e "s///" -e "s/ min.<\/td>//" > spamcop.minute

## extract URLs and fully qualified domain names from raw spamcop page
## fold to lower case first since sed cannot match case-insensitively
## handle http, https, ftp and malformed URIs lacking the ":"
## NOTE(review): the command below is truncated in this copy of the script.
## Everything between the opening quote of the fgrep pattern and the final
## output file name is missing, so the quote is never closed and nothing
## visible here creates spamcop.uri, which the paste step further down reads.
## It is kept exactly as found rather than guessed at; restore it from a
## known-good copy before running this script.
/usr/bin/tr "[:upper:]" "[:lower:]" < spamcop.raw | /usr/bin/fgrep -a ' spamcop.fqdn

## create file of calculated timestamps based on current time and "minutes ago"
## NOTE(review): echo with no arguments writes one empty line, so the first
## calculated timestamp lands on line 2 of spamcop.timestamp.  paste pairs
## files line by line, so confirm spamcop.uri and spamcop.fqdn carry a
## matching leading line; otherwise every timestamp is off by one row.
echo > spamcop.timestamp
foreach minutes_ago (`cat spamcop.minute`)
    ## accumulate timestamps with minutes ago (in seconds) subtracted from "now"
    ## dc program: minutes_ago 60 * gives seconds; "r" swaps that with now so
    ## that "-" yields now - seconds; "p" prints the result
    ## the format uses a plain space between date and time; the earlier "% "
    ## (percent-space) is not a defined strftime conversion
    /bin/date -u -r `echo "$minutes_ago 60 * $now r - p" | /usr/bin/dc` "+%Y-%m-%d %H:%M" >> spamcop.timestamp
end

## paste together latest URLs and timestamps, ditto FQDNs
## and create uniqued versions while we are at it
/usr/bin/paste spamcop.timestamp spamcop.uri \
    | /usr/bin/tee spamcop.current.uri \
    | /usr/bin/sort -u -r > spamcop.current.uri.uniq
/usr/bin/paste spamcop.timestamp spamcop.fqdn \
    | /usr/bin/tee spamcop.current.fqdn \
    | /usr/bin/sort -u -r > spamcop.current.fqdn.uniq

## touch the history files to force them to exist before sorting
/usr/bin/touch spamcop.history.uri spamcop.history.fqdn

## merge with results of old reports
## requires accurate calculated timestamps, jitter due to delays will confuse
## perhaps make it less sensitive to timing by using only the minutes
## date calculation changed to minute resolution only, seconds do not matter
## this type of sort and uniq will discard duplicated spam sites that
## also happen to occur in the same minute
## this is imperfect but fully workable since no unique sites are lost.
## in other words, all the spam sites are available, but quantity lost.
## URI history
## combine the latest reports with the stored history (reverse order, exact
## duplicate lines dropped) and pass the result through expire-records with
## 86400, the number of seconds in a day, to discard records older than 1 day
/usr/bin/sort -u -r spamcop.current.uri spamcop.history.uri | ./expire-records 86400 > spamcop.new.uri
## the surviving records replace the old history
/bin/cp -f spamcop.new.uri spamcop.history.uri
## store the history line count, used to generate percentages later
/usr/bin/wc -l < spamcop.history.uri > spamcop.history.uri.linecount

## FQDN history
## same three steps for the fully qualified domain name data
/usr/bin/sort -u -r spamcop.current.fqdn spamcop.history.fqdn | ./expire-records 86400 > spamcop.new.fqdn
/bin/cp -f spamcop.new.fqdn spamcop.history.fqdn
/usr/bin/wc -l < spamcop.history.fqdn > spamcop.history.fqdn.linecount