From 69802554fdfe9272fd05f3e55379286eb3339011 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Tue, 16 Feb 2021 10:16:04 -0500 Subject: [PATCH] Add native media types and expiration to getSiteInfo --- mod/parse_url.php | 25 --------------- src/Util/ParseUrl.php | 64 +++++++++++++++++++++++++++++++++------ src/Worker/ClearCache.php | 4 +-- 3 files changed, 56 insertions(+), 37 deletions(-) diff --git a/mod/parse_url.php b/mod/parse_url.php index 82325aa553..83997958d9 100644 --- a/mod/parse_url.php +++ b/mod/parse_url.php @@ -180,28 +180,3 @@ function parse_url_content(App $a) exit(); } - -/** - * Legacy function to call ParseUrl::getSiteinfoCached - * - * Note: We have moved the function to ParseUrl.php. This function is only for - * legacy support and will be remove in the future - * - * @param string $url The url of the page which should be scraped - * @param bool $no_guessing If true the parse doens't search for - * preview pictures - * @param bool $do_oembed The false option is used by the function fetch_oembed() - * to avoid endless loops - * - * @return array which contains needed data for embedding - * - * @throws \Friendica\Network\HTTPException\InternalServerErrorException - * @see ParseUrl::getSiteinfoCached() - * - * @deprecated since version 3.6 use ParseUrl::getSiteinfoCached instead - */ -function parseurl_getsiteinfo_cached($url, $no_guessing = false, $do_oembed = true) -{ - $siteinfo = ParseUrl::getSiteinfoCached($url, $no_guessing, $do_oembed); - return $siteinfo; -} diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php index 15186b5737..de280bcf85 100644 --- a/src/Util/ParseUrl.php +++ b/src/Util/ParseUrl.php @@ -29,6 +29,7 @@ use Friendica\Core\Logger; use Friendica\Database\Database; use Friendica\Database\DBA; use Friendica\DI; +use Friendica\Network\HTTPException; /** * Get information about a given URL @@ -37,6 +38,9 @@ use Friendica\DI; */ class ParseUrl { + const DEFAULT_EXPIRATION_FAILURE = 'now + 1 day'; + const DEFAULT_EXPIRATION_SUCCESS = 'now + 3 months'; + /** * Maximum number of characters for the description */ @@ -65,18 +69,23 @@ class ParseUrl * array 'images' => (optional) Array of preview pictures * string 'keywords' => (optional) The tags which belong to the content * - * @throws \Friendica\Network\HTTPException\InternalServerErrorException + * @throws HTTPException\InternalServerErrorException * @see ParseUrl::getSiteinfo() for more information about scraping * embeddable content */ - public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true) + public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true): array { - if ($url == "") { - return false; + if (empty($url)) { + return [ + 'url' => '', + 'type' => 'error', + ]; } + $urlHash = hash('sha256', $url); + $parsed_url = DBA::selectFirst('parsed_url', ['content'], - ['url' => Strings::normaliseLink($url), 'guessing' => !$no_guessing, 'oembed' => $do_oembed] + ['url_hash' => $urlHash, 'guessing' => !$no_guessing, 'oembed' => $do_oembed] ); if (!empty($parsed_url['content'])) { $data = unserialize($parsed_url['content']); @@ -85,12 +94,20 @@ class ParseUrl $data = self::getSiteinfo($url, $no_guessing, $do_oembed); - DBA::insert( + $expires = $data['expires']; + + unset($data['expires']); + + DI::dba()->insert( 'parsed_url', [ - 'url' => substr(Strings::normaliseLink($url), 0, 255), 'guessing' => !$no_guessing, - 'oembed' => $do_oembed, 'content' => serialize($data), - 'created' => DateTimeFormat::utcNow() + 'url_hash' => $urlHash, + 'guessing' => !$no_guessing, + 'oembed' => $do_oembed, + 'url' => $url, + 'content' => serialize($data), + 'created' => DateTimeFormat::utcNow(), + 'expires' => $expires, ], Database::INSERT_UPDATE ); @@ -117,7 +134,7 @@ class ParseUrl * * @return array which contains needed data for embedding * string 'url' => The url of the parsed page - * string 'type' => Content type + * string 'type' => Content type (error, link, photo, image, audio, video) * string 'title' => (optional) The title of the content * string 'text' => (optional) The description for the content * string 'image' => (optional) A preview image of the content (only available if $no_guessing = false) @@ -140,6 +157,13 @@ class ParseUrl */ public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) { + if (empty($url)) { + return [ + 'url' => '', + 'type' => 'error', + ]; + } + // Check if the URL does contain a scheme $scheme = parse_url($url, PHP_URL_SCHEME); @@ -154,6 +178,7 @@ class ParseUrl $siteinfo = [ 'url' => $url, 'type' => 'link', + 'expires' => DateTimeFormat::utc(self::DEFAULT_EXPIRATION_FAILURE), ]; if ($count > 10) { @@ -166,16 +191,35 @@ class ParseUrl return $siteinfo; } + $siteinfo['expires'] = DateTimeFormat::utc(self::DEFAULT_EXPIRATION_SUCCESS); + // If the file is too large then exit if (($curlResult->getInfo()['download_content_length'] ?? 0) > 1000000) { return $siteinfo; } + // Native media type, no need for HTML parsing + $type = $curlResult->getHeader('Content-Type'); + if ($type) { + preg_match('#(image|video|audio)/#i', $type, $matches); + if ($matches) { + $siteinfo['type'] = array_pop($matches); + return $siteinfo; + } + } + // If it isn't a HTML file then exit if (($curlResult->getContentType() != '') && !strstr(strtolower($curlResult->getContentType()), 'html')) { return $siteinfo; } + if ($cacheControlHeader = $curlResult->getHeader('Cache-Control')) { + if (preg_match('/max-age=([0-9]+)/i', $cacheControlHeader, $matches)) { + $maxAge = max(86400, (int)array_pop($matches)); + $siteinfo['expires'] = DateTimeFormat::utc("now + $maxAge seconds"); + } + } + $header = $curlResult->getHeader(); $body = $curlResult->getBody(); diff --git a/src/Worker/ClearCache.php b/src/Worker/ClearCache.php index 5eee4c74ab..a836e5bec6 100644 --- a/src/Worker/ClearCache.php +++ b/src/Worker/ClearCache.php @@ -64,7 +64,7 @@ class ClearCache // Delete the cached OEmbed entries that are older than three month DBA::delete('oembed', ["`created` < NOW() - INTERVAL 3 MONTH"]); - // Delete the cached "parse_url" entries that are older than three month - DBA::delete('parsed_url', ["`created` < NOW() - INTERVAL 3 MONTH"]); + // Delete the cached "parsed_url" entries that are expired + DBA::delete('parsed_url', ["`expires` < NOW()"]); } }