#!/bin/sh
# Build heuristic indices for a group of users and print them as XML on stdout.
#
# usage: heuristics.sh (SPAMMERS | USERS) username [username] [username]...
#
# Exit status: 1 when fewer than two arguments are given,
#              2 when the first argument is neither SPAMMERS nor USERS.
#
# TODO: check for minima before building indices

# The rest of the script calls get_database_user, get_database_name and
# get_database_password, none of which are defined in this file; presumably
# env.sh provides them.
. env.sh

# A group type plus at least one username is required. The usage text goes to
# stderr so it can never end up inside XML captured from stdout.
if [ "$#" -lt 2 ]; then
  echo "usage: heuristics.sh (SPAMMERS | USERS) username [username] [username]..." >&2
  exit 1
fi

# The group type must be exactly SPAMMERS or USERS.
case "$1" in
  SPAMMERS | USERS) ;;
  *)
    echo "usage: heuristics.sh (SPAMMERS | USERS) username [username] [username]..." >&2
    exit 2
    ;;
esac
|---|
| 17 |
|
|---|
| 18 |
# Print the XML declaration and the opening <user_group> tag.
# Arguments: $1 - group type, written into the type attribute.
# Outputs:   two lines on stdout.
# Defined without the `function` keyword so it also parses under a strict
# POSIX /bin/sh; printf keeps the argument from being interpreted by echo.
output_xml_doc_start() {
  printf '%s\n' "<?xml version='1.0'?>"
  printf "<user_group type='%s'>\n" "$1"
}
|---|
| 22 |
|
|---|
| 23 |
# Print the closing </user_group> tag.
# Arguments: none.
# Outputs:   one line on stdout.
# Defined without the `function` keyword so it also parses under a strict
# POSIX /bin/sh.
output_xml_doc_end() {
  printf '%s\n' "</user_group>"
}
|---|
| 26 |
|
|---|
| 27 |
# Open a <user> element and start its <heuristics element, leaving the
# latter's tag open so attributes can be appended line by line.
# Arguments: $1 - user name, written into the name attribute.
# Outputs:   two lines on stdout.
# The name is XML-escaped first ('&' before the others, so the entities that
# are added are not escaped a second time); otherwise a quote, ampersand or
# angle bracket in a user name would produce a malformed document.
output_xml_usr_start() {
  printf " <user name='%s'>\n" "$(
    printf '%s' "$1" |
      sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e "s/'/\&apos;/g"
  )"
  printf '%s\n' " <heuristics"
}
|---|
| 31 |
|
|---|
| 32 |
# Close the self-closing <heuristics tag and then the enclosing <user> element.
# Arguments: none.
# Outputs:   two lines on stdout.
# Defined without the `function` keyword so it also parses under a strict
# POSIX /bin/sh.
output_xml_usr_end() {
  printf '%s\n' " />" " </user>"
}
|---|
| 36 |
|
|---|
| 37 |
# Print one XML attribute in the form name='value'.
# Arguments: $1 - attribute name
#            $2 - attribute value (an empty value prints as name='')
# Outputs:   one line on stdout.
# The positional parameters are used directly so the function no longer
# overwrites global variables called `name` and `value`, and it is defined
# without the `function` keyword so it also parses under a strict POSIX /bin/sh.
output_xml_idx() {
  printf " %s='%s'\n" "$1" "$2"
}
|---|
| 42 |
|
|---|
| 43 |
output_xml_doc_start "$1"
shift

# db auth (these helpers are not defined in this file; presumably env.sh
# provides them):
u=$(get_database_user)
db=$(get_database_name)
p=$(get_database_password)
# Earlier revisions passed the user name as the database argument and never
# used $db; keep that as the fallback when no database name is reported.
db=${db:-$u}

# number of links to examine:
scope=10
# Fixed divisors for volume_idx and freshness_idx (values kept as they were).
pheezys_links=11658
fml_start_days=1799

# Run one SQL statement ($1) and print its rows in batch format.
# -N suppresses the column-name header so it can never be mistaken for data
# (it used to leak into the domain list for users with fewer than $scope links).
# NOTE(review): -p on the command line exposes the password to `ps`.
run_query() {
  mysql -u "$u" -p"$p" -N -B -e "$1" "$db"
}

# Run a statement ($1) that yields a single value and print its last row.
query_value() {
  run_query "$1" | tail -n1
}

# Evaluate the bc expression $1 with four decimal places.
calc() {
  echo "scale = 4; $1" | bc
}

# Succeed only when $1 is a non-empty string of decimal digits.
is_count() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

# "$@" keeps every username exactly as passed, and the loop variable is no
# longer USER, so the login name in the environment is left alone.
for username in "$@"; do

  # Escape backslashes and single quotes so the name cannot break out of the
  # SQL string literal it is placed in.
  sql_user=$(printf '%s' "$username" | sed -e 's/\\/\\\\/g' -e "s/'/''/g")

  # Literal dots are written as [.]: a backslash escape is consumed by the
  # SQL string-literal parser before the regex engine sees it, which turns
  # "\." into "any character".
  double_hyphenated_links=$(query_value "select count( url ) from links where userid = '$sql_user' and url REGEXP '.+-.+-.+[.]com';")
  known_spammer_domain_links=$(query_value "select count( url ) from links where userid = '$sql_user' and url REGEXP '(ourbigtop|blogspot)[.]com';")
  dot_info_domains=$(query_value "select count( url ) from links where userid = '$sql_user' and url REGEXP '[.]info(%2F|\$)';")
  tags_w_commas=$(query_value "select count( name ) from tags where userid = '$sql_user' and name REGEXP ',';")
  tags_w_multiple_spaces=$(query_value "select count( name ) from tags where userid = '$sql_user' and name REGEXP '. . ';")
  links=$(query_value "select count( url ) from links where userid = '$sql_user';")
  tags=$(query_value "select count( id ) from tags where userid = '$sql_user';")
  last_ten_domains_linked=$(run_query "select lower(substring_index(url, '.', 2)) from links where userid = '$sql_user' order by id desc limit ${scope};")
  has_been_snuffed=$(query_value "select snuffed from users where userid = '$sql_user';")
  # for freshness idx:
  first_link=$(query_value "select to_days( createDate ) from links where userid = '$sql_user' ORDER BY id ASC LIMIT 1;")
  last_link=$(query_value "select to_days( createDate ) from links where userid = '$sql_user' ORDER BY id DESC LIMIT 1;")

  # A missing or non-numeric count means a query failed; report it on stderr
  # and move on instead of feeding empty operands to test and bc.
  if ! is_count "$links" || ! is_count "$tags"; then
    echo "heuristics.sh: could not read link/tag counts for '$username'; skipping" >&2
    continue
  fi

  # Users without links produce no <user> element at all.
  [ "$links" -ne 0 ] || continue

  output_xml_usr_start "$username"

  # 1 when the user id contains two consecutive digits, 0 otherwise.
  case "$username" in
    *[0-9][0-9]*) userid_w_double_digits_idx=1 ;;
    *) userid_w_double_digits_idx=0 ;;
  esac
  output_xml_idx "userid_w_double_digits_idx" "$userid_w_double_digits_idx"

  # Distinct values among the most recent $scope links (one per line).
  # NOTE(review): the divisor is $scope even when the user has fewer links,
  # so such users can never reach 0 -- confirm that this is intended.
  unique_domains=$(printf '%s\n' "$last_ten_domains_linked" | sort -u | grep -c .)
  domain_duplicity_idx=$(calc "1-($unique_domains/$scope)")
  output_xml_idx "domain_duplicity_idx" "$domain_duplicity_idx"

  tags_to_links_idx=$(calc "$tags / $links")
  output_xml_idx "tags_to_links_idx" "$tags_to_links_idx"

  double_hyphenated_link_idx=$(calc "$double_hyphenated_links / $links")
  output_xml_idx "double_hyphenated_link_idx" "$double_hyphenated_link_idx"

  dot_info_domain_idx=$(calc "$dot_info_domains / $links")
  output_xml_idx "dot_info_domain_idx" "$dot_info_domain_idx"

  known_spammer_domain_idx=$(calc "$known_spammer_domain_links / $links")
  output_xml_idx "known_spammer_domain_idx" "$known_spammer_domain_idx"

  volume_idx=$(calc "1-($links / $pheezys_links)")
  output_xml_idx "volume_idx" "$volume_idx"

  # No users row, or a NULL column, is reported as 0.
  if [ -z "$has_been_snuffed" ] || [ "$has_been_snuffed" = "NULL" ]; then
    has_been_snuffed=0
  fi
  output_xml_idx "has_been_snuffed_idx" "$has_been_snuffed"

  # Tag ratios are only defined for users who have tags.
  if [ "$tags" -ne 0 ]; then
    tags_w_commas_idx=$(calc "$tags_w_commas / $tags")
    output_xml_idx "tags_w_commas_idx" "$tags_w_commas_idx"

    tags_w_multiple_spaces_idx=$(calc "$tags_w_multiple_spaces / $tags")
    output_xml_idx "tags_w_multiple_spaces_idx" "$tags_w_multiple_spaces_idx"
  fi

  # Both day numbers must be plain integers; to_days() prints NULL for a NULL
  # createDate, which must not reach the arithmetic below.
  if is_count "$first_link" && is_count "$last_link"; then
    age_in_days=$((last_link - first_link))
    freshness_idx=$(calc "1-($age_in_days / $fml_start_days)")
    output_xml_idx "freshness_idx" "$freshness_idx"
  fi

  output_xml_usr_end

done

output_xml_doc_end
|---|
| 134 |
|
|---|