| 1 |
<?php |
|---|
| 2 |
|
|---|
| 3 |
/** |
|---|
| 4 |
* Project: MagpieRSS: a simple RSS integration tool |
|---|
| 5 |
* File: rss_parse.inc - parse an RSS or Atom feed |
|---|
| 6 |
* return as a simple object. |
|---|
| 7 |
* |
|---|
| 8 |
* Handles RSS 0.9x, RSS 2.0, RSS 1.0, and Atom 0.3 |
|---|
| 9 |
* |
|---|
| 10 |
* The latest version of MagpieRSS can be obtained from: |
|---|
| 11 |
* http://magpierss.sourceforge.net |
|---|
| 12 |
* |
|---|
| 13 |
* For questions, help, comments, discussion, etc., please join the |
|---|
| 14 |
* Magpie mailing list: |
|---|
| 15 |
* magpierss-general@lists.sourceforge.net |
|---|
| 16 |
* |
|---|
| 17 |
* @author Kellan Elliott-McCrea <kellan@protest.net> |
|---|
| 18 |
* @version 0.7a |
|---|
| 19 |
* @license GPL |
|---|
| 20 |
* |
|---|
| 21 |
*/ |
|---|
| 22 |
|
|---|
| 23 |
// Feed family identifiers; MagpieRSS stores one of these in $feed_type
// once the root element of the feed has been recognised.
define('RSS', 'RSS');
define('ATOM', 'Atom');

// MAGPIE_DIR is not defined in this file, so the including script has to
// define it first.
// NOTE(review): presumably rss_utils.inc provides parse_w3cdtf(), which
// MagpieRSS::normalize() calls -- confirm.
require_once (MAGPIE_DIR . 'rss_utils.inc');
|---|
| 27 |
|
|---|
| 28 |
/**
 * Hybrid parser and result object: takes an RSS or Atom document as a string,
 * parses it, and exposes the parsed feed through its public properties
 * ($channel, $items, $image, $textinput, $feed_type, $feed_version).
 *
 * see: rss_fetch.inc for a simpler interface with integrated caching support
 *
 */
class MagpieRSS {
    var $parser;

    var $current_item   = array();  // item currently being parsed
    var $items          = array();  // collection of parsed items
    var $channel        = array();  // hash of channel fields
    var $textinput      = array();
    var $image          = array();
    var $feed_type;
    var $feed_version;
    var $encoding       = '';       // output encoding of parsed rss

    var $_source_encoding = '';     // only set if we have to parse xml prolog

    var $ERROR = "";
    var $WARNING = "";

    // define some constants

    var $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
    var $_KNOWN_ENCODINGS    = array('UTF-8', 'US-ASCII', 'ISO-8859-1');

    // parser variables, useless if you're not a parser, treat as private
    var $stack              = array(); // parser stack, innermost element first
    var $inchannel          = false;
    var $initem             = false;
    var $incontent          = false;   // name of the Atom content construct being read, or false
    var $intextinput        = false;
    var $inimage            = false;
    var $current_namespace  = false;


    /**
     * Set up XML parser, parse source, and populate this object.
     *
     * On failure the message is stored in $ERROR (or $WARNING for notices);
     * see error().
     *
     * @param string $source           string containing the RSS to be parsed
     *
     * NOTE:  Probably a good idea to leave the encoding options alone unless
     *        you know what you're doing as PHP's character set support is
     *        a little weird.
     *
     * NOTE:  A lot of this is unnecessary but harmless with PHP5
     *
     * @param string $output_encoding  output the parsed RSS in this character
     *                                 set; defaults to ISO-8859-1.
     *
     * @param string $input_encoding   the character set of the incoming RSS source.
     *                                 Leave blank and Magpie will try to figure it
     *                                 out.
     *
     * @param bool   $detect_encoding  if false Magpie won't attempt to detect
     *                                 source encoding. (caveat emptor)
     */
    function __construct ($source, $output_encoding='ISO-8859-1',
                          $input_encoding=null, $detect_encoding=true)
    {
        # if PHP xml isn't compiled in there is nothing we can do
        #
        if (!function_exists('xml_parser_create')) {
            $this->error( "Failed to load PHP's XML Extension. " .
                          "http://www.php.net/manual/en/ref.xml.php",
                          E_USER_ERROR );
            return;
        }

        list($parser, $source) = $this->create_parser($source,
                $output_encoding, $input_encoding, $detect_encoding);

        // a parser is a resource before PHP 8 and an XMLParser object from
        // PHP 8 on; anything else (false) means creation failed
        if (!is_resource($parser) and !is_object($parser)) {
            $this->error( "Failed to create an instance of PHP's XML parser. " .
                          "http://www.php.net/manual/en/ref.xml.php",
                          E_USER_ERROR );
            return;
        }

        $this->parser = $parser;

        # setup handlers
        #
        if ( version_compare(phpversion(), '5.0.0', '>=') ) {
            // callable arrays: xml_set_object() combined with bare method
            // names is deprecated as of PHP 8.4
            xml_set_element_handler($this->parser,
                    array($this, 'feed_start_element'),
                    array($this, 'feed_end_element') );
            xml_set_character_data_handler($this->parser,
                    array($this, 'feed_cdata') );
        }
        else {
            // PHP 4 copies objects placed in an array, so bind by reference
            // through xml_set_object() there
            xml_set_object( $this->parser, $this );
            xml_set_element_handler($this->parser,
                    'feed_start_element', 'feed_end_element' );
            xml_set_character_data_handler( $this->parser, 'feed_cdata' );
        }

        $status = xml_parse( $this->parser, $source );

        if (! $status ) {
            $errorcode = xml_get_error_code( $this->parser );
            if ( $errorcode != XML_ERROR_NONE ) {
                $xml_error  = xml_error_string( $errorcode );
                $error_line = xml_get_current_line_number($this->parser);
                $error_col  = xml_get_current_column_number($this->parser);
                $errormsg   = "$xml_error at line $error_line, column $error_col";

                $this->error( $errormsg );
            }
        }

        // only resource handles (PHP < 8) need an explicit free
        if ( is_resource($this->parser) ) {
            xml_parser_free( $this->parser );
        }

        $this->normalize();
    }

    /**
     * PHP 4 style constructor, kept so PHP 4 and code that calls
     * MagpieRSS() explicitly keep working. Same parameters as __construct().
     */
    function MagpieRSS ($source, $output_encoding='ISO-8859-1',
                        $input_encoding=null, $detect_encoding=true)
    {
        $this->__construct($source, $output_encoding,
                           $input_encoding, $detect_encoding);
    }

    /**
     * XML parser callback: an element was opened.
     *
     * The first element seen decides the feed type; after that the element
     * either flips one of the in* state flags, is recorded directly (Atom
     * links, inline markup inside Atom content), or is pushed on the stack so
     * feed_cdata() knows which field the following text belongs to.
     *
     * @param mixed  $p        the XML parser (unused)
     * @param string $element  element name as reported by the parser
     * @param array  $attrs    attribute name => value
     */
    function feed_start_element($p, $element, &$attrs) {
        $el = $element = strtolower($element);
        $attrs = array_change_key_case($attrs, CASE_LOWER);

        // check for a namespace prefix, and split it off if found
        $ns = false;
        if ( strpos( $element, ':' ) ) {
            list($ns, $el) = explode( ':', $element, 2 );
        }
        if ( $ns and $ns != 'rdf' ) {
            $this->current_namespace = $ns;
        }

        # if feed type isn't set, then this is first element of feed
        # identify feed from root element
        #
        if (!isset($this->feed_type) ) {
            // the version attribute is optional; leave the version null
            // when the root element does not carry one
            $version = isset($attrs['version']) ? $attrs['version'] : null;

            if ( $el == 'rdf' ) {
                $this->feed_type = RSS;
                $this->feed_version = '1.0';
            }
            elseif ( $el == 'rss' ) {
                $this->feed_type = RSS;
                $this->feed_version = $version;
            }
            elseif ( $el == 'feed' ) {
                $this->feed_type = ATOM;
                $this->feed_version = $version;
                $this->inchannel = true;
            }
            return;
        }

        if ( $el == 'channel' )
        {
            $this->inchannel = true;
        }
        elseif ($el == 'item' or $el == 'entry' )
        {
            $this->initem = true;
            if ( isset($attrs['rdf:about']) ) {
                $this->current_item['about'] = $attrs['rdf:about'];
            }
        }

        // if we're in the default namespace of an RSS feed,
        // record textinput or image fields
        elseif (
            $this->feed_type == RSS and
            $this->current_namespace == '' and
            $el == 'textinput' )
        {
            $this->intextinput = true;
        }

        elseif (
            $this->feed_type == RSS and
            $this->current_namespace == '' and
            $el == 'image' )
        {
            $this->inimage = true;
        }

        # handle atom content constructs
        elseif ( $this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS) )
        {
            // avoid clashing w/ RSS mod_content
            if ($el == 'content' ) {
                $el = 'atom_content';
            }

            $this->incontent = $el;
        }

        // if inside an Atom content construct (e.g. content or summary) field treat tags as text
        elseif ($this->feed_type == ATOM and $this->incontent )
        {
            // if tags are inlined, then flatten
            $attrs_str = join(' ',
                    array_map('map_attrs',
                    array_keys($attrs),
                    array_values($attrs) ) );

            $this->append_content( "<$element $attrs_str>" );

            array_unshift( $this->stack, $el );
        }

        // Atom supports many links per containing element.
        // Magpie treats link elements of type rel='alternate'
        // as being equivalent to RSS's simple link element.
        //
        elseif ($this->feed_type == ATOM and $el == 'link' )
        {
            // a link without a rel attribute is an "alternate" link
            // (RFC 4287, section 4.2.7.2)
            if ( !isset($attrs['rel']) or $attrs['rel'] == 'alternate' )
            {
                $link_el = 'link';
            }
            else {
                $link_el = 'link_' . $attrs['rel'];
            }

            $this->append($link_el, isset($attrs['href']) ? $attrs['href'] : '');
        }
        // set stack[0] to current element
        else {
            array_unshift($this->stack, $el);
        }
    }


    /**
     * XML parser callback: character data.
     *
     * Inside an Atom content construct the text is added to that construct;
     * otherwise it is stored under a key built from the element stack,
     * outermost element first, joined with '_'.
     *
     * @param mixed  $p     the XML parser (unused)
     * @param string $text  the character data
     */
    function feed_cdata ($p, $text) {
        if ($this->feed_type == ATOM and $this->incontent)
        {
            $this->append_content( $text );
        }
        else {
            $current_el = join('_', array_reverse($this->stack));
            $this->append($current_el, $text);
        }
    }

    /**
     * XML parser callback: an element was closed.
     *
     * Mirrors feed_start_element(): wherever the start handler decided on the
     * name with the namespace prefix stripped, the same unprefixed name is
     * used here, so state set at the opening tag is undone at the matching
     * closing tag.
     *
     * @param mixed  $p   the XML parser (unused)
     * @param string $el  element name as reported by the parser
     */
    function feed_end_element ($p, $el) {
        $el = strtolower($el);

        // name without its namespace prefix, split exactly like the start handler
        $local = $el;
        if ( strpos( $el, ':' ) ) {
            $parts = explode( ':', $el, 2 );
            $local = $parts[1];
        }

        if ( $local == 'item' or $local == 'entry' )
        {
            $this->items[] = $this->current_item;
            $this->current_item = array();
            $this->initem = false;
        }
        // textinput/image are only opened for unprefixed elements, so the
        // full name is compared here
        elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'textinput' )
        {
            $this->intextinput = false;
        }
        elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'image' )
        {
            $this->inimage = false;
        }
        elseif ($this->feed_type == ATOM and in_array($local, $this->_CONTENT_CONSTRUCTS) )
        {
            $this->incontent = false;
        }
        elseif ($local == 'channel' or $local == 'feed' )
        {
            $this->inchannel = false;
        }
        elseif ($this->feed_type == ATOM and $this->incontent ) {
            // balance tags properly
            // note: i don't think this is actually necessary
            // (the stack holds unprefixed names; the emitted tag keeps its prefix)
            if ( isset($this->stack[0]) and $this->stack[0] == $local )
            {
                $this->append_content("</$el>");
            }
            else {
                $this->append_content("<$el />");
            }

            array_shift( $this->stack );
        }
        elseif ($this->feed_type == ATOM and $local == 'link' ) {
            // the start handler records Atom links without pushing them on
            // the stack, so there is nothing to pop
        }
        else {
            array_shift( $this->stack );
        }

        $this->current_namespace = false;
    }

    /**
     * Append $str2 to $str1 in place, treating an unset $str1 as ''.
     *
     * If $str1 already holds an array (a namespace bucket) it is left alone:
     * appending would replace the whole bucket with a string.
     *
     * @param string $str1  target, modified by reference
     * @param string $str2  text to append
     */
    function concat (&$str1, $str2="") {
        if (!isset($str1) ) {
            $str1 = "";
        }
        elseif ( is_array($str1) ) {
            return;
        }
        $str1 .= $str2;
    }


    /**
     * Append text to the Atom content construct currently being read
     * ($this->incontent), on the current item if inside one, otherwise on
     * the channel.
     *
     * @param string $text  text or flattened markup to append
     */
    function append_content($text) {
        if ( $this->initem ) {
            $this->concat( $this->current_item[ $this->incontent ], $text );
        }
        elseif ( $this->inchannel ) {
            $this->concat( $this->channel[ $this->incontent ], $text );
        }
    }

    /**
     * smart append - field and namespace aware
     *
     * Adds $text to field $el of whichever structure is being parsed. While
     * a namespace is active the field is stored one level down, under the
     * namespace prefix.
     *
     * @param string $el    field name; nothing happens when empty
     * @param string $text  text to append
     */
    function append($el, $text) {
        if (!$el) {
            return;
        }

        $ns = $this->current_namespace;

        if ( $ns )
        {
            // precedence for namespaced fields: item, channel, textinput, image
            if     ( $this->initem )      { $target =& $this->current_item; }
            elseif ( $this->inchannel )   { $target =& $this->channel; }
            elseif ( $this->intextinput ) { $target =& $this->textinput; }
            elseif ( $this->inimage )     { $target =& $this->image; }
            else {
                return;
            }

            // a plain element already stored text under this key; indexing
            // into that string would be a fatal error, so drop the field
            if ( isset($target[$ns]) and !is_array($target[$ns]) ) {
                return;
            }

            $this->concat( $target[$ns][$el], $text );
        }
        else {
            // precedence for plain fields: item, textinput, image, channel
            if     ( $this->initem )      { $target =& $this->current_item; }
            elseif ( $this->intextinput ) { $target =& $this->textinput; }
            elseif ( $this->inimage )     { $target =& $this->image; }
            elseif ( $this->inchannel )   { $target =& $this->channel; }
            else {
                return;
            }

            $this->concat( $target[$el], $text );
        }
    }

    /**
     * Cross-populate RSS and Atom field names so callers can use either
     * vocabulary, and add 'date_timestamp' to items whose date parses.
     *
     * The feed type (not the version) decides which mapping runs, so feeds
     * whose root element carries no version attribute are normalized too.
     */
    function normalize () {
        // if atom populate rss fields
        if ( $this->feed_type == ATOM ) {
            $this->channel['description'] =
                isset($this->channel['tagline']) ? $this->channel['tagline'] : null;

            foreach ( array_keys($this->items) as $i ) {
                $item = $this->items[$i];
                if ( isset($item['summary']) ) {
                    $item['description'] = $item['summary'];
                }
                if ( isset($item['atom_content']) ) {
                    $item['content']['encoded'] = $item['atom_content'];
                }

                // prefer the issued date, fall back to the modified date
                $atom_date = null;
                if ( isset($item['issued']) ) {
                    $atom_date = $item['issued'];
                }
                elseif ( isset($item['modified']) ) {
                    $atom_date = $item['modified'];
                }

                if ( $atom_date ) {
                    $epoch = @parse_w3cdtf($atom_date);
                    if ($epoch and $epoch > 0) {
                        $item['date_timestamp'] = $epoch;
                    }
                }

                $this->items[$i] = $item;
            }
        }
        elseif ( $this->feed_type == RSS ) {
            $this->channel['tagline'] =
                isset($this->channel['description']) ? $this->channel['description'] : null;

            foreach ( array_keys($this->items) as $i ) {
                $item = $this->items[$i];
                if ( isset($item['description']) ) {
                    $item['summary'] = $item['description'];
                }
                if ( isset($item['content']['encoded']) ) {
                    $item['atom_content'] = $item['content']['encoded'];
                }

                if ( $this->feed_version == '1.0' and isset($item['dc']['date']) ) {
                    $epoch = @parse_w3cdtf($item['dc']['date']);
                    if ($epoch and $epoch > 0) {
                        $item['date_timestamp'] = $epoch;
                    }
                }
                elseif ( isset($item['pubdate']) ) {
                    $epoch = @strtotime($item['pubdate']);
                    if ($epoch > 0) {
                        $item['date_timestamp'] = $epoch;
                    }
                }

                $this->items[$i] = $item;
            }
        }
    }


    /**
     * @return mixed  the feed version if this is an RSS feed, false otherwise.
     *                The version itself may be empty when the root element
     *                had no version attribute.
     */
    function is_rss () {
        if ( $this->feed_type == RSS ) {
            return $this->feed_version;
        }
        else {
            return false;
        }
    }

    /**
     * @return mixed  the feed version if this is an Atom feed, false otherwise.
     *                The version itself may be empty when the root element
     *                had no version attribute.
     */
    function is_atom() {
        if ( $this->feed_type == ATOM ) {
            return $this->feed_version;
        }
        else {
            return false;
        }
    }

    /**
     * return XML parser, and possibly re-encoded source
     *
     * @param string $source   the feed
     * @param string $out_enc  target encoding for parsed text; skipped if empty
     * @param string $in_enc   source encoding, or empty to detect
     * @param bool   $detect   whether to attempt encoding detection
     * @return array           array(parser, source)
     */
    function create_parser($source, $out_enc, $in_enc, $detect) {
        // compare whole versions: looking only at the first digit would send
        // PHP 7 and later down the PHP 4 path
        if ( version_compare(phpversion(), '5.0.0', '>=') ) {
            $parser = $this->php5_create_parser($in_enc, $detect);
        }
        else {
            list($parser, $source) = $this->php4_create_parser($source, $in_enc, $detect);
        }
        if ($out_enc) {
            $this->encoding = $out_enc;
            // nothing to configure if parser creation failed; the caller
            // reports that case
            if ( is_resource($parser) or is_object($parser) ) {
                xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $out_enc);
            }
        }

        return array($parser, $source);
    }

    /**
     * Instantiate an XML parser under PHP5
     *
     * PHP5 will do a fine job of detecting input encoding
     * if passed an empty string as the encoding.
     *
     * All hail libxml2!
     *
     */
    function php5_create_parser($in_enc, $detect) {
        // by default php5 does a fine job of detecting input encodings
        if (!$detect && $in_enc) {
            return xml_parser_create($in_enc);
        }
        else {
            return xml_parser_create('');
        }
    }

    /**
     * Instantiate an XML parser under PHP4
     *
     * Unfortunately PHP4's support for character encodings
     * and especially XML and character encodings sucks.  As
     * long as the documents you parse only contain characters
     * from the ISO-8859-1 character set (a superset of ASCII,
     * and a subset of UTF-8) you're fine.  However once you
     * step out of that comfy little world things get mad, bad,
     * and dangerous to know.
     *
     * The following code is based on SJM's work with FoF
     * @see http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss
     *
     * @return array  array(parser, source), the source possibly converted to UTF-8
     */
    function php4_create_parser($source, $in_enc, $detect) {
        if ( !$detect ) {
            return array(xml_parser_create($in_enc), $source);
        }

        if (!$in_enc) {
            // read the encoding from the XML prolog; the '?' must be escaped
            // or it only makes the '<' optional
            if (preg_match('/<\?xml.*encoding=[\'"](.*?)[\'"].*?>/m', $source, $m)) {
                $in_enc = strtoupper($m[1]);
                $this->_source_encoding = $in_enc;
            }
            else {
                $in_enc = 'UTF-8';
            }
        }

        if ($this->known_encoding($in_enc)) {
            return array(xml_parser_create($in_enc), $source);
        }

        // the detected encoding is not one of the simple encodings PHP knows

        // attempt to use the iconv extension to
        // cast the XML to a known encoding
        // @see http://php.net/iconv

        if (function_exists('iconv')) {
            $encoded_source = iconv($in_enc, 'UTF-8', $source);
            if ($encoded_source) {
                return array(xml_parser_create('UTF-8'), $encoded_source);
            }
        }

        // iconv didn't work, try mb_convert_encoding
        // @see http://php.net/mbstring
        if (function_exists('mb_convert_encoding')) {
            $encoded_source = mb_convert_encoding($source, 'UTF-8', $in_enc );
            if ($encoded_source) {
                return array(xml_parser_create('UTF-8'), $encoded_source);
            }
        }

        // else
        $this->error("Feed is in an unsupported character encoding. ($in_enc) " .
                     "You may see strange artifacts, and mangled characters.",
                     E_USER_NOTICE);

        return array(xml_parser_create(), $source);
    }

    /**
     * @param string $enc  encoding name, any case
     * @return mixed       the upper-cased name if it is listed in
     *                     $_KNOWN_ENCODINGS, false otherwise
     */
    function known_encoding($enc) {
        $enc = strtoupper($enc);
        if ( in_array($enc, $this->_KNOWN_ENCODINGS) ) {
            return $enc;
        }
        else {
            return false;
        }
    }

    /**
     * Report a problem and remember it on the object.
     *
     * With MAGPIE_DEBUG defined and true the message is raised through
     * trigger_error() at level $lvl; otherwise it is written with
     * error_log(). Notice-level messages are stored in $WARNING, everything
     * else in $ERROR.
     *
     * @param string $errormsg  the message
     * @param int    $lvl       PHP error level
     */
    function error ($errormsg, $lvl=E_USER_WARNING) {
        // the constant is not defined in this file, so check before reading it
        if ( defined('MAGPIE_DEBUG') and MAGPIE_DEBUG ) {
            trigger_error( $errormsg, $lvl);
        }
        else {
            error_log( $errormsg, 0);
        }

        $notices = E_USER_NOTICE|E_NOTICE;
        if ( $lvl&$notices ) {
            $this->WARNING = $errormsg;
        } else {
            $this->ERROR = $errormsg;
        }
    }


} // end class RSS
|---|
| 581 |
|
|---|
| 582 |
/**
 * Render one attribute as name="value" for markup that is being written
 * back out as text.
 *
 * The XML parser hands over attribute values with entities already decoded,
 * so the markup-significant characters are escaped again here; otherwise a
 * quote or ampersand in a value would break (or inject into) the generated
 * tag. The replacement is byte-wise on ASCII characters only.
 *
 * @param string $k  attribute name
 * @param string $v  attribute value
 * @return string    name="escaped value"
 */
function map_attrs($k, $v) {
    // '&' must be replaced first so the entities added afterwards survive
    $v = str_replace(
        array('&', '"', '<', '>'),
        array('&amp;', '&quot;', '&lt;', '&gt;'),
        (string) $v );

    return "$k=\"$v\"";
}
|---|
| 585 |
|
|---|
| 586 |
// patch to support medieval versions of PHP4.1.x, |
|---|
| 587 |
// courtesy, Ryan Currie, ryan@digibliss.com |
|---|
| 588 |
|
|---|
| 589 |
if (!function_exists('array_change_key_case')) {
    if (!defined('CASE_UPPER')) {
        define('CASE_UPPER', 1);
    }
    if (!defined('CASE_LOWER')) {
        define('CASE_LOWER', 0);
    }

    /**
     * Fallback used only when PHP does not provide array_change_key_case().
     *
     * @param array $array  input array
     * @param int   $case   CASE_LOWER (default) or CASE_UPPER
     * @return array        copy of $array with every key passed through
     *                      strtolower() or strtoupper(); empty input gives
     *                      an empty array
     */
    function array_change_key_case($array, $case=CASE_LOWER) {
        // compare, don't assign: "=" here always selected the upper-case branch
        $cmd = ($case == CASE_UPPER) ? 'strtoupper' : 'strtolower';

        $output = array();
        foreach ($array as $key => $value) {
            $output[$cmd($key)] = $value;
        }
        return $output;
    }
}
|---|
| 604 |
|
|---|
| 605 |
?> |
|---|