friendica/include/Scrape.php

<?php

require_once('library/HTML5/Parser.php');

if(! function_exists('scrape_dfrn')) {
/**
 * Fetch a page and collect the DFRN endpoints and hCard details found in it.
 *
 * From <link> elements this records the Atom alternate feed ('feed_atom'),
 * every rel value starting with "dfrn-" (stored under that rel name), and the
 * nickname taken from the acct: URI of an "lrdd" link ('nick'). From every
 * element whose class matches 'vcard' (per attribute_contains()) it records
 * the 'fn', 'photo' and 'key' members.
 *
 * @param string $url Address of the page to fetch.
 * @return array Associative array of the values found. Empty when the fetch
 *               returns nothing, when the response declares an Atom/RSS
 *               content type, or when the document cannot be parsed.
 */
function scrape_dfrn($url) {

	$a = get_app();

	$ret = array();

	logger('scrape_dfrn: url=' . $url);

	$s = fetch_url($url);

	if(! $s)
		return $ret;

	// Headers as reported by the app object - presumably those of the fetch above.
	$headers = $a->get_curl_headers();
	logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);

	foreach(explode("\n",$headers) as $line) {
		// don't try and run feeds through the html5 parser
		if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
			return $ret;
	}

	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;

	$items = $dom->getElementsByTagName('link');

	// get DFRN link elements

	foreach($items as $item) {
		$x = $item->getAttribute('rel');
		if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))
			$ret['feed_atom'] = $item->getAttribute('href');
		if(substr($x,0,5) == "dfrn-")
			$ret[$x] = $item->getAttribute('href');
		if($x === 'lrdd') {
			// The href may be URL-encoded; the nickname is the part between "acct:" and "@".
			$decoded = urldecode($item->getAttribute('href'));
			if(preg_match('/acct:([^@]*)@/',$decoded,$matches))
				$ret['nick'] = $matches[1];
		}
	}

	// Pull out hCard profile elements

	$items = $dom->getElementsByTagName('*');
	foreach($items as $item) {
		if(attribute_contains($item->getAttribute('class'), 'vcard')) {
			$level2 = $item->getElementsByTagName('*');
			// Later matches overwrite earlier ones, so the last matching element wins.
			foreach($level2 as $x) {
				if(attribute_contains($x->getAttribute('class'),'fn'))
					$ret['fn'] = $x->textContent;
				if(attribute_contains($x->getAttribute('class'),'photo'))
					$ret['photo'] = $x->getAttribute('src');
				if(attribute_contains($x->getAttribute('class'),'key'))
					$ret['key'] = $x->textContent;
			}
		}
	}

	return $ret;
}}


if(! function_exists('validate_dfrn')) {
/**
 * Count the required DFRN fields that do not pass the x() check.
 *
 * The fields checked are 'key', 'dfrn-request', 'dfrn-confirm',
 * 'dfrn-notify' and 'dfrn-poll'.
 *
 * @param array $a Scraped values, keyed by field name.
 * @return int Number of required fields for which x() reports nothing usable;
 *             0 means every field passed.
 */
function validate_dfrn($a) {
	$required = array('key', 'dfrn-request', 'dfrn-confirm', 'dfrn-notify', 'dfrn-poll');

	$missing = 0;
	foreach($required as $field) {
		if(! x($a,$field))
			$missing ++;
	}

	return $missing;
}}

if(! function_exists('scrape_meta')) {
/**
 * Fetch a page and collect the DFRN values published in its <meta> elements.
 *
 * Every <meta> whose name attribute starts with "dfrn-" is stored under that
 * name, with the element's content attribute as the value.
 *
 * @param string $url Address of the page to fetch.
 * @return array Associative array of the values found. Empty when the fetch
 *               returns nothing, when the response declares an Atom/RSS
 *               content type, or when the document cannot be parsed.
 */
function scrape_meta($url) {

	$a = get_app();

	$ret = array();

	logger('scrape_meta: url=' . $url);

	$s = fetch_url($url);

	if(! $s)
		return $ret;

	// Headers as reported by the app object - presumably those of the fetch above.
	$headers = $a->get_curl_headers();
	logger('scrape_meta: headers=' . $headers, LOGGER_DEBUG);

	foreach(explode("\n",$headers) as $line) {
		// don't try and run feeds through the html5 parser
		if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
			return $ret;
	}

	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;

	$items = $dom->getElementsByTagName('meta');

	// get DFRN meta elements

	foreach($items as $item) {
		$x = $item->getAttribute('name');
		if(substr($x,0,5) == "dfrn-")
			$ret[$x] = $item->getAttribute('content');
	}

	return $ret;
}}


if(! function_exists('scrape_vcard')) {
/**
 * Fetch a page and collect the hCard profile details found in it.
 *
 * Inside every element whose class matches 'vcard' (per attribute_contains())
 * this records 'fn' (text of an 'fn' element), 'photo' (src of a 'photo' or
 * 'avatar' element) and 'nick' (text of a 'nickname' or 'uid' element).
 *
 * @param string $url Address of the page to fetch.
 * @return array Associative array of the values found. Empty when the fetch
 *               returns nothing, when the response declares an Atom/RSS
 *               content type, or when the document cannot be parsed.
 */
function scrape_vcard($url) {

	$a = get_app();

	$ret = array();

	logger('scrape_vcard: url=' . $url);

	$s = fetch_url($url);

	if(! $s)
		return $ret;

	// Headers as reported by the app object - presumably those of the fetch above.
	$headers = $a->get_curl_headers();

	foreach(explode("\n",$headers) as $line) {
		// don't try and run feeds through the html5 parser
		if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
			return $ret;
	}

	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;

	// Pull out hCard profile elements

	$items = $dom->getElementsByTagName('*');
	foreach($items as $item) {
		if(attribute_contains($item->getAttribute('class'), 'vcard')) {
			$level2 = $item->getElementsByTagName('*');
			// Later matches overwrite earlier ones, so the last matching element wins.
			foreach($level2 as $x) {
				if(attribute_contains($x->getAttribute('class'),'fn'))
					$ret['fn'] = $x->textContent;
				if((attribute_contains($x->getAttribute('class'),'photo'))
					|| (attribute_contains($x->getAttribute('class'),'avatar')))
					$ret['photo'] = $x->getAttribute('src');
				if((attribute_contains($x->getAttribute('class'),'nickname'))
					|| (attribute_contains($x->getAttribute('class'),'uid')))
					$ret['nick'] = $x->textContent;
			}
		}
	}

	return $ret;
}}


if(! function_exists('scrape_feed')) {
/**
 * Work out which feed(s) a URL offers.
 *
 * If the response carries a content-type header and either that header or the
 * body itself looks like Atom or RSS, the URL is reported as the feed and
 * nothing else is examined. Otherwise the page is parsed as HTML and searched
 * for an <img id="profile-image"> and for alternate Atom/RSS <link> elements.
 *
 * @param string $url Address to fetch.
 * @return array May contain 'feed_atom', 'feed_rss' and 'photo'; empty when
 *               the fetch returns nothing or nothing was recognised.
 */
function scrape_feed($url) {

	$a = get_app();

	$ret = array();
	$s = fetch_url($url);

	if(! $s)
		return $ret;

	$headers = $a->get_curl_headers();
	logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);

	// Body sniffing does not depend on the header line, so do it once up front.
	$body_is_atom = (stripos($s,'<feed') !== false);
	$body_is_rss  = (stripos($s,'<rss') !== false);

	foreach(explode("\n",$headers) as $header_line) {
		if(stripos($header_line,'content-type:') === false)
			continue;

		// Atom is checked first, so it wins when both would match.
		if($body_is_atom || (stripos($header_line,'application/atom+xml') !== false)) {
			$ret['feed_atom'] = $url;
			return $ret;
		}
		if($body_is_rss || (stripos($header_line,'application/rss+xml') !== false)) {
			$ret['feed_rss'] = $url;
			return $ret;
		}
	}

	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;

	// get img elements (twitter)

	$images = $dom->getElementsByTagName('img');
	if($images) {
		foreach($images as $img) {
			if($img->getAttribute('id') === 'profile-image')
				$ret['photo'] = $img->getAttribute('src');
		}
	}

	// get Atom/RSS link elements, take the first one of either.

	$feed_keys = array(
		'application/atom+xml' => 'feed_atom',
		'application/rss+xml'  => 'feed_rss'
	);

	$links = $dom->getElementsByTagName('link');
	if($links) {
		foreach($links as $link) {
			if($link->getAttribute('rel') !== 'alternate')
				continue;

			$type = $link->getAttribute('type');
			if(! isset($feed_keys[$type]))
				continue;

			$key = $feed_keys[$type];
			if(! x($ret,$key))
				$ret[$key] = $link->getAttribute('href');
		}
	}

	return $ret;
}}
Initial checkin 2010-07-01 19:48:07 -04:00			`<?php`

			`require_once('library/HTML5/Parser.php');`

			`if(! function_exists('scrape_dfrn')) {`
			`function scrape_dfrn($url) {`

don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$a = get_app();`

Initial checkin 2010-07-01 19:48:07 -04:00			`$ret = array();`
suppress some scraping errors when confronted with hybrid/strange feeds that provide insufficient content-type and choke the html parser. 2011-02-02 17:48:27 -05:00
			`logger('scrape_dfrn: url=' . $url);`

Initial checkin 2010-07-01 19:48:07 -04:00			`$s = fetch_url($url);`

			`if(! $s)`
			`return $ret;`

don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$headers = $a->get_curl_headers();`
suppress some scraping errors when confronted with hybrid/strange feeds that provide insufficient content-type and choke the html parser. 2011-02-02 17:48:27 -05:00			`logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);`


don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$lines = explode("\n",$headers);`
			`if(count($lines)) {`
			`foreach($lines as $line) {`
			`// don't try and run feeds through the html5 parser`
			`if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) \|\| (stristr($line,'application/rss+xml'))))`
			`return ret;`
			`}`
			`}`


Initial checkin 2010-07-01 19:48:07 -04:00			`$dom = HTML5_Parser::parse($s);`

			`if(! $dom)`
			`return $ret;`

			`$items = $dom->getElementsByTagName('link');`

			`// get DFRN link elements`

			`foreach($items as $item) {`
			`$x = $item->getAttribute('rel');`
following random feeds 2011-01-23 23:09:34 -05:00			`if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))`
			`$ret['feed_atom'] = $item->getAttribute('href');`
Initial checkin 2010-07-01 19:48:07 -04:00			`if(substr($x,0,5) == "dfrn-")`
			`$ret[$x] = $item->getAttribute('href');`
add nicknames to contact records (going forward and retroactive) 2010-10-23 04:20:26 -04:00			`if($x === 'lrdd') {`
			`$decoded = urldecode($item->getAttribute('href'));`
			`if(preg_match('/acct:([^@]*)@/',$decoded,$matches))`
			`$ret['nick'] = $matches[1];`
			`}`
Initial checkin 2010-07-01 19:48:07 -04:00			`}`

			`// Pull out hCard profile elements`

			`$items = $dom->getElementsByTagName('*');`
			`foreach($items as $item) {`
			`if(attribute_contains($item->getAttribute('class'), 'vcard')) {`
			`$level2 = $item->getElementsByTagName('*');`
			`foreach($level2 as $x) {`
			`if(attribute_contains($x->getAttribute('class'),'fn'))`
			`$ret['fn'] = $x->textContent;`
			`if(attribute_contains($x->getAttribute('class'),'photo'))`
			`$ret['photo'] = $x->getAttribute('src');`
			`if(attribute_contains($x->getAttribute('class'),'key'))`
			`$ret['key'] = $x->textContent;`
two-way subscriptions working with federated social accounts 2010-10-26 00:52:30 -04:00			`}`
Initial checkin 2010-07-01 19:48:07 -04:00			`}`
			`}`

			`return $ret;`
			`}}`






			`if(! function_exists('validate_dfrn')) {`
			`function validate_dfrn($a) {`
			`$errors = 0;`
			`if(! x($a,'key'))`
			`$errors ++;`
			`if(! x($a,'dfrn-request'))`
			`$errors ++;`
			`if(! x($a,'dfrn-confirm'))`
			`$errors ++;`
			`if(! x($a,'dfrn-notify'))`
			`$errors ++;`
			`if(! x($a,'dfrn-poll'))`
			`$errors ++;`
			`return $errors;`
			`}}`

mucho progress on notifier, email style dfrn url's 2010-07-08 10:03:25 -04:00			`if(! function_exists('scrape_meta')) {`
			`function scrape_meta($url) {`
Initial checkin 2010-07-01 19:48:07 -04:00
don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$a = get_app();`

mucho progress on notifier, email style dfrn url's 2010-07-08 10:03:25 -04:00			`$ret = array();`
suppress some scraping errors when confronted with hybrid/strange feeds that provide insufficient content-type and choke the html parser. 2011-02-02 17:48:27 -05:00
			`logger('scrape_meta: url=' . $url);`

mucho progress on notifier, email style dfrn url's 2010-07-08 10:03:25 -04:00			`$s = fetch_url($url);`

			`if(! $s)`
			`return $ret;`

don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$headers = $a->get_curl_headers();`
suppress some scraping errors when confronted with hybrid/strange feeds that provide insufficient content-type and choke the html parser. 2011-02-02 17:48:27 -05:00			`logger('scrape_meta: headers=' . $headers, LOGGER_DEBUG);`

don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$lines = explode("\n",$headers);`
			`if(count($lines)) {`
			`foreach($lines as $line) {`
			`// don't try and run feeds through the html5 parser`
			`if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) \|\| (stristr($line,'application/rss+xml'))))`
			`return ret;`
			`}`
			`}`



mucho progress on notifier, email style dfrn url's 2010-07-08 10:03:25 -04:00			`$dom = HTML5_Parser::parse($s);`

			`if(! $dom)`
			`return $ret;`
Initial checkin 2010-07-01 19:48:07 -04:00
mucho progress on notifier, email style dfrn url's 2010-07-08 10:03:25 -04:00			`$items = $dom->getElementsByTagName('meta');`

			`// get DFRN link elements`

			`foreach($items as $item) {`
			`$x = $item->getAttribute('name');`
			`if(substr($x,0,5) == "dfrn-")`
			`$ret[$x] = $item->getAttribute('content');`
			`}`

			`return $ret;`
			`}}`
two-way subscriptions working with federated social accounts 2010-10-26 00:52:30 -04:00

			`if(! function_exists('scrape_vcard')) {`
			`function scrape_vcard($url) {`

don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$a = get_app();`

two-way subscriptions working with federated social accounts 2010-10-26 00:52:30 -04:00			`$ret = array();`
suppress some scraping errors when confronted with hybrid/strange feeds that provide insufficient content-type and choke the html parser. 2011-02-02 17:48:27 -05:00
			`logger('scrape_vcard: url=' . $url);`

two-way subscriptions working with federated social accounts 2010-10-26 00:52:30 -04:00			`$s = fetch_url($url);`

			`if(! $s)`
			`return $ret;`

don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$headers = $a->get_curl_headers();`
			`$lines = explode("\n",$headers);`
			`if(count($lines)) {`
			`foreach($lines as $line) {`
			`// don't try and run feeds through the html5 parser`
			`if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) \|\| (stristr($line,'application/rss+xml'))))`
			`return ret;`
			`}`
			`}`

two-way subscriptions working with federated social accounts 2010-10-26 00:52:30 -04:00			`$dom = HTML5_Parser::parse($s);`

			`if(! $dom)`
			`return $ret;`

			`// Pull out hCard profile elements`

			`$items = $dom->getElementsByTagName('*');`
			`foreach($items as $item) {`
			`if(attribute_contains($item->getAttribute('class'), 'vcard')) {`
			`$level2 = $item->getElementsByTagName('*');`
			`foreach($level2 as $x) {`
			`if(attribute_contains($x->getAttribute('class'),'fn'))`
			`$ret['fn'] = $x->textContent;`
			`if((attribute_contains($x->getAttribute('class'),'photo'))`
			`\|\| (attribute_contains($x->getAttribute('class'),'avatar')))`
			`$ret['photo'] = $x->getAttribute('src');`
			`if((attribute_contains($x->getAttribute('class'),'nickname'))`
			`\|\| (attribute_contains($x->getAttribute('class'),'uid')))`
			`$ret['nick'] = $x->textContent;`
			`}`
			`}`
			`}`

			`return $ret;`
			`}}`
following random feeds 2011-01-23 23:09:34 -05:00

			`if(! function_exists('scrape_feed')) {`
			`function scrape_feed($url) {`

don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$a = get_app();`

following random feeds 2011-01-23 23:09:34 -05:00			`$ret = array();`
			`$s = fetch_url($url);`

			`if(! $s)`
			`return $ret;`

don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$headers = $a->get_curl_headers();`
suppress some scraping errors when confronted with hybrid/strange feeds that provide insufficient content-type and choke the html parser. 2011-02-02 17:48:27 -05:00			`logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);`

don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$lines = explode("\n",$headers);`
			`if(count($lines)) {`
			`foreach($lines as $line) {`
			`if(stristr($line,'content-type:')) {`
suppress some scraping errors when confronted with hybrid/strange feeds that provide insufficient content-type and choke the html parser. 2011-02-02 17:48:27 -05:00			`if(stristr($line,'application/atom+xml') \|\| stristr($s,'<feed')) {`
don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$ret['feed_atom'] = $url;`
			`return $ret;`
			`}`
suppress some scraping errors when confronted with hybrid/strange feeds that provide insufficient content-type and choke the html parser. 2011-02-02 17:48:27 -05:00			`if(stristr($line,'application/rss+xml') \|\| stristr($s,'<rss')) {`
don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`$ret['feed_rss'] = $url;`
better handling of troublesome feeds. 2011-04-04 22:36:18 -04:00			`return $ret;`
don't try to scrape atom/rss feeds. missing param in contact store (mod_follow) 2011-02-01 17:55:29 -05:00			`}`
			`}`
			`}`
			`}`

following random feeds 2011-01-23 23:09:34 -05:00			`$dom = HTML5_Parser::parse($s);`

			`if(! $dom)`
			`return $ret;`

better twitter follow support 2011-04-04 23:40:31 -04:00
			`$items = $dom->getElementsByTagName('img');`

			`// get img elements (twitter)`

			`if($items) {`
			`foreach($items as $item) {`
			`$x = $item->getAttribute('id');`
			`if($x === 'profile-image') {`
			`$ret['photo'] = $item->getAttribute('src');`
			`}`
			`}`
			`}`

following random feeds 2011-01-23 23:09:34 -05:00			`$items = $dom->getElementsByTagName('link');`

better twitter follow support 2011-04-04 23:40:31 -04:00			`// get Atom/RSS link elements, take the first one of either.`
following random feeds 2011-01-23 23:09:34 -05:00
better twitter follow support 2011-04-04 23:40:31 -04:00			`if($items) {`
			`foreach($items as $item) {`
			`$x = $item->getAttribute('rel');`
			`if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml')) {`
			`if(! x($ret,'feed_atom'))`
			`$ret['feed_atom'] = $item->getAttribute('href');`
			`}`
			`if(($x === 'alternate') && ($item->getAttribute('type') === 'application/rss+xml')) {`
			`if(! x($ret,'feed_rss'))`
			`$ret['feed_rss'] = $item->getAttribute('href');`
			`}`
			`}`
following random feeds 2011-01-23 23:09:34 -05:00			`}`

			`return $ret;`
			`}}`