root/feedmelinks/bin/hybrid-to-xml.pl

Revision 1425, 2.2 kB (checked in by jm3, 2 years ago)

gulp. svn diff for the deltas

Line 
#!/usr/local/bin/perl

# Convert from the delicious hybrid export format to valid XML.
#
# Input is read with <>, i.e. from the files named on the command line or
# from STDIN when none are given; the XML document is written to STDOUT.

use strict;
use warnings;

print "<?xml version='1.0' standalone='yes'?>
<posts update='2007-02-27T15:32:34Z' user='fuck_compatibility'>
";

# Format is a <DT> line with the link info, optionally followed by a <DD>
# line with the link comment.

# Marker appended to the description; the comment text is inserted right
# after it (the marker itself stays in the output).
my $comment_delim = "__FML_COMMENT__";

my $header    = 1;     # true until the first line containing <DL> is seen
my $last_link = "";    # most recently built <post .../> element
my $pending   = 0;     # true while $last_link has not been printed yet

while ( my $line = <> ) {

    # Skip everything in front of the first <DL>.
    if ($header) {
        next unless $line =~ /<DL>/;
        $header = 0;
    }

    next unless $line =~ /<(?:DT|DD)>/;
    chomp $line;

    # link info comes first...
    if ( $line =~ /<DT>/ ) {

        # The previous link had no <DD> comment: emit it now instead of
        # silently overwriting it.
        print "$last_link\n" if $pending;

        # Turn the <A ...>title</A> anchor into a self-closing <post .../>.
        $line =~ s/^<DT>//;
        $line =~ s/ LAST_VISIT="\d+" //;
        $line =~ s/ADD_DATE="\d+"//;
        $line =~ s/<A/  <post/;
        $line =~ s/A>/post>/;
        $line =~ s/HREF="/href="/;
        $line =~ s/TAGS="/tag="/;

        # Cut the link title out of the element. Only trust $1 when the
        # substitution really matched; otherwise it holds a stale capture.
        my $name = ( $line =~ s/>([^<]+)<.*$/\/>/ ) ? $1 : '';

        # PLACEHOLDER THAT THE COMMENT IS INSERTED AFTER IN THE NEXT BLOCK
        $name = escape_string($name) . $comment_delim;

        my $post = $line;

        # Tags are comma separated in the export, space separated in the XML.
        # An empty tag list is left untouched.
        if ( $post =~ /tag="([^"]+)"/ ) {
            my $tags = $1;
            $tags =~ tr/,/ /;
            $post =~ s/tag="[^"]+"/tag="$tags"/;
        }

        # The description goes right behind the first attribute.
        $post =~ s/" /" description="$name" /;

        # Pull the href out, clean it and put it back in front of the
        # description. Skipped when either piece is missing, so the href is
        # never dropped or duplicated.
        if ( $post =~ /description="/ and $post =~ s/href="([^"]+)"// ) {
            my $href = make_safe_url($1);
            $post =~ s/description="/href="$href" description="/;
        }

        $last_link = $post;
        $pending   = 1;
    }

    # link COMMENTS come next:
    else {
        my $comment = $line;
        $comment =~ s/^<DD>//;
        $comment = escape_string($comment);
        $last_link =~ s/\Q$comment_delim\E/$comment_delim$comment/;
        print "$last_link\n";
        $pending = 0;
    }
}

# Flush a trailing link that never got a comment.
print "$last_link\n" if $pending;

print "\n</posts>\n";
69
# Escape a link name or comment so it can be embedded in a double-quoted
# XML attribute value. This is NOT url encoding.
#
#   $_[0]   the raw text (undef is treated as the empty string)
#   returns the escaped text
sub escape_string {
    my $s = $_[0];
    $s = '' unless defined $s;

    # Escape every ampersand that does not already start one of the five
    # predefined XML entities. This also covers "&" followed by "a" and a
    # trailing "&", and no longer double-escapes &lt; &gt; and &quot;.
    $s =~ s/&(?!(?:amp|lt|gt|quot|apos);)/&amp;/g;
    $s =~ s/"/&quot;/g;

    # escape all but these chars:
    $s =~ s/([^,\]\[\)\(%A-Za-z! \/\\|0-9'&;:._\?=-])/sprintf("%%%02X", ord($1))/seg;

    # The UTF-8 guillemets become numeric character references:
    # C2 AB is U+00AB (171), C2 BB is U+00BB (187).
    $s =~ s/%C2%AB/&#171;/g;
    $s =~ s/%C2%BB/&#187;/g;
    return $s;
}
84
# Percent-encode the query string of a URL; everything in front of the
# first "?" is passed through untouched.
#
#   $_[0]   the URL
#   returns the URL with every query character other than A-Z a-z 0-9 ? =
#           replaced by %XX; a URL without a query is returned unchanged
#
# NOTE(review): "&" and "%" are encoded as well, so parameter separators
# and already-encoded sequences in the query are rewritten -- confirm that
# this is what the consumer of the XML expects.
sub make_safe_url {
    my $url = $_[0];
    return $url unless defined $url;

    # Split on the first "?" only, so a "?" inside the query does not
    # prevent the query from being encoded.
    my ( $base, $query ) = split /\?/, $url, 2;
    return $url unless defined $query;

    $query =~ s/([^A-Za-z0-9\?=])/sprintf("%%%02X", ord($1))/seg;

    # Put the separator back between path and query.
    return $base . '?' . $query;
}
Note: See TracBrowser for help on using the browser.