#!/bin/tcsh
## grab-spamcop
## get spamcop URI and minutes ago data and store in flat text db
##
## Revision history:
## 1.00 initial
## 1.01 fold case to lower, handle http, https, ftp URIs 2/26/04 -- Jeff Chan
## 1.02 add sort -u of current data for the domain parser 2/27/04 -- Jeff Chan
## 1.03 moved sed hyperlink patterns to external files 2/27/04 -- Jeff Chan
## 1.04 changed from head to time-based expire of old data 2/29/04 -- Jeff Chan
## 1.05 removed unused time computation, csh math integer 3/1/04 -- Jeff Chan
## 1.06 added sed file to remove illegal domain characters 4/1/04 -- Jeff Chan
## 1.07 added -a flag to greps to treat all as text 5/3/04 -- Jeff Chan
## download the SpamCop "in progress" report page (type=www) into spamcop.raw
set spamcop_url = "http://www.spamcop.net/w3m?action=inprogress&type=www"
/usr/local/bin/lwp-request "$spamcop_url" > spamcop.raw
## remember the current time as printed by the local helper script
## (UTC seconds since the epoch); @ stores it as a shell integer in $now
@ now = `./epoch_seconds_utc`
## extract "minutes ago" data from raw spamcop page into spamcop.minute
## fold to lower case first since sed can't do case insensitivity
## -a makes egrep treat the input as text even if it looks binary
## NOTE(review): the egrep pattern and the first sed expression below look
## damaged in this copy (the pattern spans a line break and "s///" has an
## empty regex) -- presumably HTML table-cell tags were stripped from them;
## restore both from the pristine script before running
/usr/bin/tr "[:upper:]" "[:lower:]" < spamcop.raw | /usr/bin/egrep -a "^
.*min. | " | /usr/bin/sed -e "s///" -e "s/ min.<\/td>//" > spamcop.minute
## extract URLs and fully qualified domain names from raw spamcop page
## fold to lower case first since sed can't do case insensitivity
## handle http, https, ftp and malformed URIs lacking the ":"
## NOTE(review): the command below is cut off in this copy -- the single quote
## is never closed and nothing visible here writes spamcop.uri, which the
## paste step reads later; presumably the fgrep pattern, the sed stages and
## a second pipeline were lost -- restore them from the pristine script
/usr/bin/tr "[:upper:]" "[:lower:]" < spamcop.raw | /usr/bin/fgrep -a ' spamcop.fqdn
## build spamcop.timestamp: one UTC "YYYY-MM-DD HH:MM" stamp for every entry
## of spamcop.minute, computed as now - (minutes ago * 60 seconds)
## the echo (re)creates the file before the loop appends to it
## NOTE(review): if echo emits a newline here the file starts with a blank
## line, which would shift the later paste alignment by one -- confirm
echo > spamcop.timestamp
foreach minutes_ago (`cat spamcop.minute`)
    ## dc works in RPN: push minutes*60 and now, swap (r), subtract, print (p)
    ## date -r turns the resulting epoch seconds into text, -u keeps it in UTC
    ## a plain space separates date and time in the format, because a
    ## "% " pair is not a defined strftime conversion
    /bin/date -u -r `echo "$minutes_ago 60 * $now r - p" | /usr/bin/dc` "+%Y-%m-%d %H:%M" >> spamcop.timestamp
end
## pair every calculated timestamp with the URL on the same line number;
## tee keeps the complete list in spamcop.current.uri while the
## reverse-sorted, de-duplicated copy goes to spamcop.current.uri.uniq
/usr/bin/paste spamcop.timestamp spamcop.uri \
    | /usr/bin/tee spamcop.current.uri \
    | /usr/bin/sort -u -r > spamcop.current.uri.uniq
## same treatment for the fully qualified domain names
/usr/bin/paste spamcop.timestamp spamcop.fqdn \
    | /usr/bin/tee spamcop.current.fqdn \
    | /usr/bin/sort -u -r > spamcop.current.fqdn.uniq
## create the history files if they are missing so the sorts below always
## find both of their inputs
/usr/bin/touch spamcop.history.uri
/usr/bin/touch spamcop.history.fqdn
## merge the fresh reports with the results of old reports
## - sort -u removes duplicates by whole line (timestamp plus site), so the
##   merge depends on accurate calculated timestamps; jitter due to delays
##   will confuse it
## - timestamps carry minute resolution only (seconds do not matter), which
##   makes the merge less sensitive to timing
## - a spam site reported more than once within the same minute collapses
##   into one record; this is imperfect but workable: every unique site is
##   still available, only the quantity is lost
## - the merged list is piped through ./expire-records with 86400 to discard
##   records older than 1 day
set max_age = 86400
/usr/bin/sort -u -r spamcop.current.uri spamcop.history.uri \
    | ./expire-records $max_age > spamcop.new.uri
/usr/bin/sort -u -r spamcop.current.fqdn spamcop.history.fqdn \
    | ./expire-records $max_age > spamcop.new.fqdn
## install the merged and expired lists as the new history files;
## cp -f overwrites the old history and leaves the spamcop.new.* files in place
set fresh = spamcop.new
set kept = spamcop.history
/bin/cp -f ${fresh}.uri ${kept}.uri
/bin/cp -f ${fresh}.fqdn ${kept}.fqdn
## store the number of lines of each history file in a matching *.linecount
## file so that percentages can be generated later
## wc reads from stdin, so only the bare count (no file name) is written
/usr/bin/wc -l < spamcop.history.uri > spamcop.history.uri.linecount
/usr/bin/wc -l < spamcop.history.fqdn > spamcop.history.fqdn.linecount