The "scrape" bugfix led to partly damaged encodings. This is fixed now.

2015-01-04 11:56:41 +01:00
parent 3195bacd9e
commit a86c143e24
5 changed files with 19 additions and 11 deletions
@@ -20,7 +20,12 @@ class HTML5_Parser

 	// Cleanup invalid HTML
 	$doc = new DOMDocument();
-	@$doc->loadHTML($text);
+
+	if (mb_detect_encoding($text, "UTF-8", true) == "UTF-8")
+		@$doc->loadHTML('<?xml encoding="UTF-8" ?>'.$text);
+	else
+		@$doc->loadHTML($text);
+
 	$text = $doc->saveHTML();

        $tokenizer = new HTML5_Tokenizer($text, $builder);