root/feedmelinks/bin/heuristics.sh

Revision 1339, 4.7 kB (checked in by jm3, 2 years ago)

cleanup

  • Property svn:executable set to *
Line 
#!/bin/sh
# heuristics.sh - print an XML <user_group> document containing heuristic
# indices for each username given on the command line.
#
# usage: heuristics.sh (SPAMMERS | USERS) username [username] [username]...
#
# Exit status:
#   1  fewer than two arguments were given
#   2  the first argument is neither SPAMMERS nor USERS
#
# TODO: check for minima before building indices

# The database credential helpers called later in this script are not
# defined in this file; presumably env.sh provides them.
. env.sh

# Print the usage line on stderr so it never mixes with the XML on stdout.
usage() {
        echo "usage: heuristics.sh (SPAMMERS | USERS) username [username] [username]..." >&2
}

# Need the group type plus at least one username.
if [ "$#" -lt 2 ]
then
        usage
        exit 1
fi

# A case pattern avoids the obsolescent, ambiguous "-a" operator of test.
case $1 in
        SPAMMERS|USERS)
                ;;
        *)
                usage
                exit 2
                ;;
esac
17
# Emit the XML declaration and open the <user_group> root element.
# Arguments: $1 - value written into the "type" attribute
# Outputs:   two lines on stdout
# Defined with the portable name() form: the "function" keyword is not
# available in every /bin/sh.
output_xml_doc_start() {
        printf '%s\n' "<?xml version='1.0'?>"
        printf "<user_group type='%s'>\n" "$1"
}
22
# Close the <user_group> root element.
# Arguments: none
# Outputs:   one line on stdout
# Defined with the portable name() form: the "function" keyword is not
# available in every /bin/sh.
output_xml_doc_end() {
        printf '%s\n' "</user_group>"
}
26
# Open a <user> element and start its <heuristics tag; the tag is left
# unterminated so attribute lines can be printed after it.
# Arguments: $1 - username written into the "name" attribute
# Outputs:   two lines on stdout
# Defined with the portable name() form: the "function" keyword is not
# available in every /bin/sh.
output_xml_usr_start() {
        # Escape XML-special characters so an unusual username cannot break
        # the single-quoted attribute. "&" must be replaced first.
        printf "  <user name='%s'>\n" "$(printf '%s' "$1" |
                sed -e 's/&/\&amp;/g' \
                    -e 's/</\&lt;/g' \
                    -e 's/>/\&gt;/g' \
                    -e "s/'/\&apos;/g")"
        printf '%s\n' "    <heuristics"
}
31
# Terminate the open <heuristics tag and close the <user> element.
# Arguments: none
# Outputs:   two lines on stdout
# Defined with the portable name() form: the "function" keyword is not
# available in every /bin/sh.
output_xml_usr_end() {
        printf '%s\n' "    />"
        printf '%s\n' "  </user>"
}
36
# Print one name='value' attribute line, indented six spaces.
# Arguments: $1 - attribute name
#            $2 - attribute value (may be empty or omitted)
# Outputs:   one line on stdout
# The positional parameters are used directly so that no global "name" or
# "value" variable is overwritten as a side effect.
output_xml_idx() {
        printf "      %s='%s'\n" "$1" "$2"
}
42
# ---------------------------------------------------------------------------
# Main: open the XML document, then for every username left on the command
# line query the database and print one <user> element whose <heuristics>
# attributes are ratios computed with bc. Users with zero links are skipped.
# ---------------------------------------------------------------------------
output_xml_doc_start "$1"
shift

# db auth (these helpers are not defined in this file; presumably env.sh
# provides them):
u=$(get_database_user)
db=$(get_database_name)
p=$(get_database_password)

# number of most recent links to examine for domain_duplicity_idx:
scope=10
# divisor for volume_idx (presumably the link count of the heaviest known
# user -- confirm):
pheezys_links=11658
# divisor for freshness_idx, in days (presumably the age of the site when
# this was written -- confirm):
fml_start_days=1799

# sql_quote STRING - escape backslashes and single quotes so STRING can be
# embedded in a single-quoted MySQL string literal.
sql_quote() {
        printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e "s/'/''/g"
}

# run_query SQL - run SQL through the mysql client and print the result rows.
# -N suppresses the column-name header, so callers only ever see data rows
# even when a query returns fewer rows than expected.
# The database name comes from get_database_name; it falls back to the user
# name (what the script used to pass) when that helper returns nothing.
# NOTE(review): the password is passed on the command line and is therefore
# visible in the process list.
run_query() {
        mysql -u "$u" -p"$p" -N -B -e "$1" "${db:-$u}"
}

# query_value SQL - print the last result row only (empty when no rows).
query_value() {
        run_query "$1" | tail -n 1
}

# calc EXPR - evaluate EXPR with bc using four decimal places.
calc() {
        echo "scale = 4; $1" | bc
}

# The loop variable is deliberately not called USER, which would overwrite
# the login-name environment variable for every child process.
for username in "$@"
do
        quoted_user=$(sql_quote "$username")

        # "[.]" matches a literal dot without relying on backslashes that the
        # shell and the MySQL string parser would each strip.
        double_hyphenated_links=$(query_value "select count( url ) from links where userid = '$quoted_user' and url REGEXP '.+-.+-.+[.]com';")
        known_spammer_domain_links=$(query_value "select count( url ) from links where userid = '$quoted_user' and url REGEXP '(ourbigtop|blogspot)[.]com';")
        dot_info_domains=$(query_value "select count( url ) from links where userid = '$quoted_user' and url REGEXP '[.]info(%2F|\$)';")
        tags_w_commas=$(query_value "select count( name ) from tags where userid = '$quoted_user' and name REGEXP ',';")
        tags_w_multiple_spaces=$(query_value "select count( name ) from tags where userid = '$quoted_user' and name REGEXP '. . ';")
        links=$(query_value "select count( url ) from links where userid = '$quoted_user';")
        tags=$(query_value "select count( id ) from tags where userid = '$quoted_user';")
        last_ten_domains_linked=$(run_query "select lower(substring_index(url, '.', 2)) from links where userid = '$quoted_user' order by id desc limit ${scope};")
        has_been_snuffed=$(query_value "select snuffed from users where userid = '$quoted_user';")
        # for freshness idx:
        first_link=$(query_value "select to_days( createDate ) from links where userid = '$quoted_user' ORDER BY id ASC LIMIT 1;")
        last_link=$(query_value "select to_days( createDate ) from links where userid = '$quoted_user' ORDER BY id DESC LIMIT 1;")

        # A failed query leaves $links empty or non-numeric; report it and
        # move on instead of feeding garbage to test and bc.
        case $links in
                ''|*[!0-9]*)
                        echo "heuristics.sh: could not count links for '$username'; skipping" >&2
                        continue
                        ;;
        esac

        # Every ratio below divides by $links, so users without links are
        # left out of the document entirely.
        [ "$links" -ne 0 ] || continue

        # Counts that came back empty are treated as zero so bc always gets
        # a well-formed expression.
        tags=${tags:-0}
        double_hyphenated_links=${double_hyphenated_links:-0}
        known_spammer_domain_links=${known_spammer_domain_links:-0}
        dot_info_domains=${dot_info_domains:-0}
        tags_w_commas=${tags_w_commas:-0}
        tags_w_multiple_spaces=${tags_w_multiple_spaces:-0}

        output_xml_usr_start "$username"

        # 1 when the username contains two consecutive digits, else 0.
        case $username in
                *[0-9][0-9]*) userid_w_double_digits_idx=1 ;;
                *)            userid_w_double_digits_idx=0 ;;
        esac
        output_xml_idx "userid_w_double_digits_idx" "$userid_w_double_digits_idx"

        # Distinct values among the fetched rows, one row per line;
        # "grep -c ." counts the non-empty lines.
        unique_domains=$(printf '%s\n' "$last_ten_domains_linked" | sort -u | grep -c .)
        domain_duplicity_idx=$(calc "1-($unique_domains/$scope)")
        output_xml_idx "domain_duplicity_idx" "$domain_duplicity_idx"

        tags_to_links_idx=$(calc "$tags / $links")
        output_xml_idx "tags_to_links_idx" "$tags_to_links_idx"

        double_hyphenated_link_idx=$(calc "$double_hyphenated_links / $links")
        output_xml_idx "double_hyphenated_link_idx" "$double_hyphenated_link_idx"

        dot_info_domain_idx=$(calc "$dot_info_domains / $links")
        output_xml_idx "dot_info_domain_idx" "$dot_info_domain_idx"

        known_spammer_domain_idx=$(calc "$known_spammer_domain_links / $links")
        output_xml_idx "known_spammer_domain_idx" "$known_spammer_domain_idx"

        volume_idx=$(calc "1-($links / $pheezys_links)")
        output_xml_idx "volume_idx" "$volume_idx"

        # A missing row or a NULL column counts as "not snuffed".
        if [ -z "$has_been_snuffed" ] || [ "$has_been_snuffed" = "NULL" ]
        then
                has_been_snuffed=0
        fi
        output_xml_idx "has_been_snuffed_idx" "$has_been_snuffed"

        # Tag ratios are only defined for users who have at least one tag.
        if [ "$tags" -ne 0 ]
        then
                tags_w_commas_idx=$(calc "$tags_w_commas / $tags")
                output_xml_idx "tags_w_commas_idx" "$tags_w_commas_idx"

                tags_w_multiple_spaces_idx=$(calc "$tags_w_multiple_spaces / $tags")
                output_xml_idx "tags_w_multiple_spaces_idx" "$tags_w_multiple_spaces_idx"
        fi

        # freshness_idx needs both day numbers; it is omitted when either is
        # missing or not a plain integer (for example a NULL createDate).
        if [ -n "$first_link" ] && [ -n "$last_link" ]
        then
                case "$first_link$last_link" in
                        *[!0-9]*)
                                ;;
                        *)
                                age_in_days=$((last_link - first_link))
                                freshness_idx=$(calc "1-($age_in_days / $fml_start_days)")
                                output_xml_idx "freshness_idx" "$freshness_idx"
                                ;;
                esac
        fi

        output_xml_usr_end

done

output_xml_doc_end
134
Note: See TracBrowser for help on using the browser.