More general content type detection

This commit is contained in:
Michael 2021-03-13 13:17:42 +00:00
parent acffafe6b9
commit 7adbd73eca

View File

@ -54,25 +54,21 @@ class ParseUrl
/**
 * Fetch the content type of the given url
 *
 * Sends a HEAD request and splits the media type found in the "Content-Type"
 * response header into its parts. Header parameters (everything after the
 * first ";", e.g. "charset=utf-8") are discarded.
 *
 * @param string $url URL of the page
 * @return array Lower-cased, trimmed media type parts, e.g. ['text', 'html'];
 *               an empty array when the request fails or no usable
 *               Content-Type header is present
 */
public static function getContentType(string $url): array
{
    $curlResult = DI::httpRequest()->head($url);
    if (!$curlResult->isSuccess()) {
        return [];
    }

    $contenttype = $curlResult->getHeader('Content-Type');
    if (empty($contenttype)) {
        return [];
    }

    // Media type names are case-insensitive and servers may pad the
    // separators with whitespace ("Text/HTML ; charset=utf-8"). Callers compare
    // the parts against lower-case literals, so normalise before splitting.
    $mediatype = strtolower(trim(current(explode(';', $contenttype))));
    if ($mediatype === '') {
        // Header carried only parameters (e.g. "; charset=utf-8")
        return [];
    }

    return array_map('trim', explode('/', $mediatype));
}
/** /**
@ -211,8 +207,14 @@ class ParseUrl
} }
$type = self::getContentType($url); $type = self::getContentType($url);
if (in_array($type, ['image', 'video', 'audio'])) { Logger::info('Got content-type', ['content-type' => $type, 'url' => $url]);
$siteinfo['type'] = $type; if (!empty($type) && in_array($type[0], ['image', 'video', 'audio'])) {
$siteinfo['type'] = $type[0];
return $siteinfo;
}
if ((count($type) >= 2) && (($type[0] != 'text') || ($type[1] != 'html'))) {
Logger::info('Unparseable content-type, quitting here, ', ['content-type' => $type, 'url' => $url]);
return $siteinfo; return $siteinfo;
} }
@ -228,21 +230,6 @@ class ParseUrl
return $siteinfo; return $siteinfo;
} }
// Native media type, no need for HTML parsing
$type = $curlResult->getHeader('Content-Type');
if ($type) {
preg_match('#(image|video|audio)/#i', $type, $matches);
if ($matches) {
$siteinfo['type'] = array_pop($matches);
return $siteinfo;
}
}
// If it isn't a HTML file then exit
if (($curlResult->getContentType() != '') && !strstr(strtolower($curlResult->getContentType()), 'html')) {
return $siteinfo;
}
if ($cacheControlHeader = $curlResult->getHeader('Cache-Control')) { if ($cacheControlHeader = $curlResult->getHeader('Cache-Control')) {
if (preg_match('/max-age=([0-9]+)/i', $cacheControlHeader, $matches)) { if (preg_match('/max-age=([0-9]+)/i', $cacheControlHeader, $matches)) {
$maxAge = max(86400, (int)array_pop($matches)); $maxAge = max(86400, (int)array_pop($matches));