parse_url: Further improvements of the new method to fetch page data

2012-07-12 23:41:04 +02:00
parent 09034ce0ee
commit 02a1fc9cd0
2 changed files with 66 additions and 41 deletions
--- a/include/api.php
+++ b/include/api.php
@@ -1727,5 +1727,6 @@ notifications/follow
 notifications/leave
 blocks/exists
 blocks/blocking
+lists
 */

--- a/mod/parse_url.php
+++ b/mod/parse_url.php
@@ -1,6 +1,4 @@
 <?php
-require_once('include/Photo.php');
-
 if(!function_exists('deletenode')) {
 	function deletenode(&$doc, $node)
 	{
@@ -11,6 +9,30 @@ if(!function_exists('deletenode')) {
 	}
 }

+function completeurl($url, $scheme) {
+        $urlarr = parse_url($url);
+
+        if (isset($urlarr["scheme"]))
+                return($url);
+
+        $schemearr = parse_url($scheme);
+
+        $complete = $schemearr["scheme"]."://".$schemearr["host"];
+
+        if ($schemearr["port"] != "")
+                $complete .= ":".$schemearr["port"];
+
+        $complete .= $urlarr["path"];
+
+        if ($urlarr["query"] != "")
+                $complete .= "?".$urlarr["query"];
+
+        if ($urlarr["fragment"] != "")
+                $complete .= "#".$urlarr["fragment"];
+
+        return($complete);
+}
+
 function parseurl_getsiteinfo($url) {
 	$siteinfo = array();

@@ -25,7 +47,8 @@ function parseurl_getsiteinfo($url) {
 	$header = curl_exec($ch);
 	curl_close($ch);

-	if (preg_match('/charset=(.*?)\n/', $header, $matches))
+	// Fetch the first mentioned charset. Can be in body or header
+	if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches))
 		$charset = trim(array_pop($matches));
 	else
 		$charset = "utf-8";
@@ -57,11 +80,13 @@ function parseurl_getsiteinfo($url) {

 	$xpath = new DomXPath($doc);

-	$list = $xpath->query("head/title");
+	//$list = $xpath->query("head/title");
+	$list = $xpath->query("//title");
 	foreach ($list as $node)
 		$siteinfo["title"] =  html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");

-	$list = $xpath->query("head/meta[@name]");
+	//$list = $xpath->query("head/meta[@name]");
+	$list = $xpath->query("//meta[@name]");
 	foreach ($list as $node) {
 		$attr = array();
 		if ($node->attributes->length)
@@ -86,7 +111,8 @@ function parseurl_getsiteinfo($url) {
 		}
 	}

-	$list = $xpath->query("head/meta[@property]");
+	//$list = $xpath->query("head/meta[@property]");
+	$list = $xpath->query("//meta[@property]");
 	foreach ($list as $node) {
 		$attr = array();
 		if ($node->attributes->length)
@@ -116,38 +142,32 @@ function parseurl_getsiteinfo($url) {
                                foreach ($node->attributes as $attribute)
                                        $attr[$attribute->name] = $attribute->value;

-                        // guess mimetype from headers or filename
-                        $type = guess_image_type($attr["src"],true);
+			$src = completeurl($attr["src"], $url);
+			$photodata = getimagesize($src);

-                        $i = fetch_url($attr["src"]);
-                        $ph = new Photo($i, $type);
-
-			if($ph->is_valid() and ($ph->getWidth() > 200) and ($ph->getHeight() > 200)) {
-				if ($siteinfo["image"] == "")
-	                                $siteinfo["image"] = $attr["src"];
-
-				if($ph->getWidth() > 300 || $ph->getHeight() > 300) {
-					$ph->scaleImage(300);
-	                                $siteinfo["images"][] = array("src"=>$attr["src"],
-									"width"=>$ph->getWidth(),
-									"height"=>$ph->getHeight());
-				} else
-	                                $siteinfo["images"][] = array("src"=>$attr["src"],
-									"width"=>$ph->getWidth(),
-									"height"=>$ph->getHeight());
+			if (($photodata[0] > 150) and ($photodata[1] > 150)) {
+				if ($photodata[0] > 300) {
+					$photodata[1] = $photodata[1] * (300 / $photodata[0]);
+					$photodata[0] = 300;
+				}
+				if ($photodata[1] > 300) {
+					$photodata[0] = $photodata[0] * (300 / $photodata[1]);
+					$photodata[1] = 300;
+				}
+				$siteinfo["images"][] = array("src"=>$src,
+								"width"=>$photodata[0],
+								"height"=>$photodata[1]);
 			}
+
                }
        } else {
-		// guess mimetype from headers or filename
-                $type = guess_image_type($siteinfo["image"],true);
+		$src = completeurl($siteinfo["image"], $url);
+		$photodata = getimagesize($src);

-                $i = fetch_url($siteinfo["image"]);
-                $ph = new Photo($i, $type);
-
-		if($ph->is_valid())
-			$siteinfo["images"][] = array("src"=>$siteinfo["image"],
-							"width"=>$ph->getWidth(),
-							"height"=>$ph->getHeight());
+		if (($photodata[0] > 10) and ($photodata[1] > 10))
+			$siteinfo["images"][] = array("src"=>$src,
+							"width"=>$photodata[0],
+							"height"=>$photodata[1]);
 	}

 	if ($siteinfo["text"] == "") {
@@ -155,19 +175,22 @@ function parseurl_getsiteinfo($url) {

 		$list = $xpath->query("//div[@class='article']");
 		foreach ($list as $node)
-			$text .= " ".trim($node->nodeValue);
+			if (strlen($node->nodeValue) > 40)
+				$text .= " ".trim($node->nodeValue);

 		if ($text == "") {
 			$list = $xpath->query("//div[@class='content']");
 			foreach ($list as $node)
-				$text .= " ".trim($node->nodeValue);
+				if (strlen($node->nodeValue) > 40)
+					$text .= " ".trim($node->nodeValue);
 		}

 		// If none text was found then take the paragraph content
 		if ($text == "") {
 			$list = $xpath->query("//p");
 			foreach ($list as $node)
-				$text .= " ".trim($node->nodeValue);
+				if (strlen($node->nodeValue) > 40)
+					$text .= " ".trim($node->nodeValue);
 		}

 		if ($text != "") {
@@ -238,9 +261,9 @@ function parse_url_content(&$a) {
 	if($url && $title && $text) {

 		if($textmode)
-			$text = $br . $br . '[quote]' . trim($text) . '[/quote]' . $br;
+			$text = $br . '[quote]' . trim($text) . '[/quote]' . $br;
 		else
-			$text = '<br /><br /><blockquote>' . trim($text) . '</blockquote><br />';
+			$text = '<br /><blockquote>' . trim($text) . '</blockquote><br />';

 		$title = str_replace(array("\r","\n"),array('',''),$title);

@@ -255,7 +278,8 @@ function parse_url_content(&$a) {
 	$siteinfo = parseurl_getsiteinfo($url);

 	if($siteinfo["title"] == "") {
-		echo sprintf($template,$url,$url,'') . $str_tags;
+		echo print_r($siteinfo, true);
+		//echo sprintf($template,$url,$url,'') . $str_tags;
 		killme();
 	} else {
 		$text = $siteinfo["text"];
@@ -305,7 +329,7 @@ function parse_url_content(&$a) {
 	}

 	if($image) {
-		$text = $br.$br.$image.$br.$text;
+		$text = $br.$br.$image.$text;
 	}
 	$title = str_replace(array("\r","\n"),array('',''),$title);

@@ -313,6 +337,6 @@ function parse_url_content(&$a) {

 	logger('parse_url: returns: ' . $result);

-	echo $result;
+	echo trim($result);
 	killme();
 }