From 8385ee7a612b33c2e2af2533d922f4a8a21e6dd4 Mon Sep 17 00:00:00 2001 From: Philipp Date: Mon, 23 Aug 2021 14:28:25 +0200 Subject: [PATCH] Use mattwright/urlresolver for HTTPClient::finalUrl() --- composer.json | 3 +- composer.lock | 48 ++++++++++- src/Factory/HTTPClientFactory.php | 10 ++- src/Network/HTTPClient.php | 131 ++++++------------------------ src/Network/IHTTPClient.php | 6 +- 5 files changed, 83 insertions(+), 115 deletions(-) diff --git a/composer.json b/composer.json index 2dd5dec7b9..bf0559254e 100644 --- a/composer.json +++ b/composer.json @@ -69,7 +69,8 @@ "npm-asset/perfect-scrollbar": "0.6.16", "npm-asset/textcomplete": "^0.18.2", "npm-asset/typeahead.js": "^0.11.1", - "minishlink/web-push": "^6.0" + "minishlink/web-push": "^6.0", + "mattwright/urlresolver": "^2.0" }, "repositories": [ { diff --git a/composer.lock b/composer.lock index 5e8f1a20aa..906a681e45 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "7d6dee6e449da931e8fe209e61b2e78e", + "content-hash": "c9e0a9eacc23d884012042eeab01cc8b", "packages": [ { "name": "asika/simple-console", @@ -1133,6 +1133,52 @@ ], "time": "2017-07-19T15:11:19+00:00" }, + { + "name": "mattwright/urlresolver", + "version": "2.0", + "source": { + "type": "git", + "url": "https://github.com/mattwright/URLResolver.php.git", + "reference": "416039192cb6d9158bdacd68349bceff8739b857" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/mattwright/URLResolver.php/zipball/416039192cb6d9158bdacd68349bceff8739b857", + "reference": "416039192cb6d9158bdacd68349bceff8739b857", + "shasum": "" + }, + "require": { + "ext-curl": "*", + "ext-mbstring": "*", + "php": ">=5.3" + }, + "type": "library", + "autoload": { + "psr-4": { + "mattwright\\": "." + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Matt Wright", + "email": "mw@mattwright.com" + } + ], + "description": "PHP class that attempts to resolve URLs to a final, canonical link.", + "homepage": "https://github.com/mattwright/URLResolver.php", + "keywords": [ + "canonical", + "link", + "redirect", + "resolve", + "url" + ], + "time": "2019-01-18T00:59:34+00:00" + }, { "name": "michelf/php-markdown", "version": "1.9.0", diff --git a/src/Factory/HTTPClientFactory.php b/src/Factory/HTTPClientFactory.php index 636f8a46d9..c1cb475414 100644 --- a/src/Factory/HTTPClientFactory.php +++ b/src/Factory/HTTPClientFactory.php @@ -10,6 +10,7 @@ use Friendica\Network\IHTTPClient; use Friendica\Util\Profiler; use GuzzleHttp\Client; use GuzzleHttp\RequestOptions; +use mattwright\URLResolver; use Psr\Http\Message\RequestInterface; use Psr\Http\Message\ResponseInterface; use Psr\Http\Message\UriInterface; @@ -85,6 +86,13 @@ class HTTPClientFactory extends BaseFactory ], ]); - return new HTTPClient($logger, $this->profiler, $this->config, $userAgent, $guzzle); + $resolver = new URLResolver(); + $resolver->setUserAgent($userAgent); + $resolver->setMaxRedirects(10); + $resolver->setRequestTimeout(10); + // if the file is too large then exit + $resolver->setMaxResponseDataSize(1000000); + + return new HTTPClient($logger, $this->profiler, $guzzle, $resolver); } } diff --git a/src/Network/HTTPClient.php b/src/Network/HTTPClient.php index 000d3c76af..d83b805df0 100644 --- a/src/Network/HTTPClient.php +++ b/src/Network/HTTPClient.php @@ -21,9 +21,6 @@ namespace Friendica\Network; -use DOMDocument; -use DomXPath; -use Friendica\Core\Config\IConfig; use Friendica\Core\System; use Friendica\Util\Network; use Friendica\Util\Profiler; @@ -32,6 +29,7 @@ use GuzzleHttp\Cookie\FileCookieJar; use GuzzleHttp\Exception\RequestException; use GuzzleHttp\Exception\TransferException; use GuzzleHttp\RequestOptions; +use mattwright\URLResolver; use Psr\Http\Message\ResponseInterface; use Psr\Log\LoggerInterface; @@ -44,20 +42,17 @@ class HTTPClient implements IHTTPClient private $logger; /** @var Profiler */ private $profiler; - /** @var IConfig */ - private $config; - /** @var string */ - private $userAgent; /** @var Client */ private $client; + /** @var URLResolver */ + private $resolver; - public function __construct(LoggerInterface $logger, Profiler $profiler, IConfig $config, string $userAgent, Client $client) + public function __construct(LoggerInterface $logger, Profiler $profiler, Client $client, URLResolver $resolver) { - $this->logger = $logger; - $this->profiler = $profiler; - $this->config = $config; - $this->userAgent = $userAgent; - $this->client = $client; + $this->logger = $logger; + $this->profiler = $profiler; + $this->client = $client; + $this->resolver = $resolver; } /** @@ -97,6 +92,11 @@ class HTTPClient implements IHTTPClient return CurlResult::createErrorCurl($url); } + if (Network::isRedirectBlocked($url)) { + $this->logger->info('Domain should not be redirected.', ['url' => $url]); + return CurlResult::createErrorCurl($url); + } + $conf = []; if (!empty($opts['cookiejar'])) { @@ -197,10 +197,12 @@ class HTTPClient implements IHTTPClient /** * {@inheritDoc} */ - public function finalUrl(string $url, int $depth = 1, bool $fetchbody = false) + public function finalUrl(string $url) { + $this->profiler->startRecording('network'); + if (Network::isLocalLink($url)) { - $this->logger->info('Local link', ['url' => $url, 'callstack' => System::callstack(20)]); + $this->logger->debug('Local link', ['url' => $url, 'callstack' => System::callstack(20)]); } if (Network::isUrlBlocked($url)) { @@ -215,104 +217,19 @@ class HTTPClient implements IHTTPClient $url = Network::stripTrackingQueryParams($url); - if ($depth > 10) { - return $url; - } - $url = trim($url, "'"); - $this->profiler->startRecording('network'); + // Designate a temporary file that will store cookies during the session. + // Some websites test the browser for cookie support, so this enhances results. + $this->resolver->setCookieJar(tempnam(get_temppath() , 'url_resolver-')); - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_HEADER, 1); - curl_setopt($ch, CURLOPT_NOBODY, 1); - curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10); - curl_setopt($ch, CURLOPT_TIMEOUT, 10); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent); + $urlResult = $this->resolver->resolveURL($url); - curl_exec($ch); - $curl_info = @curl_getinfo($ch); - $http_code = $curl_info['http_code']; - curl_close($ch); - - $this->profiler->stopRecording(); - - if ($http_code == 0) { - return $url; + if ($urlResult->didErrorOccur()) { + throw new TransferException($urlResult->getErrorMessageString()); } - if (in_array($http_code, ['301', '302'])) { - if (!empty($curl_info['redirect_url'])) { - return $this->finalUrl($curl_info['redirect_url'], ++$depth, $fetchbody); - } elseif (!empty($curl_info['location'])) { - return $this->finalUrl($curl_info['location'], ++$depth, $fetchbody); - } - } - - // Check for redirects in the meta elements of the body if there are no redirects in the header. - if (!$fetchbody) { - return $this->finalUrl($url, ++$depth, true); - } - - // if the file is too large then exit - if ($curl_info["download_content_length"] > 1000000) { - return $url; - } - - // if it isn't a HTML file then exit - if (!empty($curl_info["content_type"]) && !strstr(strtolower($curl_info["content_type"]), "html")) { - return $url; - } - - $this->profiler->startRecording('network'); - - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_HEADER, 0); - curl_setopt($ch, CURLOPT_NOBODY, 0); - curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10); - curl_setopt($ch, CURLOPT_TIMEOUT, 10); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent); - - $body = curl_exec($ch); - curl_close($ch); - - $this->profiler->stopRecording(); - - if (trim($body) == "") { - return $url; - } - - // Check for redirect in meta elements - $doc = new DOMDocument(); - @$doc->loadHTML($body); - - $xpath = new DomXPath($doc); - - $list = $xpath->query("//meta[@content]"); - foreach ($list as $node) { - $attr = []; - if ($node->attributes->length) { - foreach ($node->attributes as $attribute) { - $attr[$attribute->name] = $attribute->value; - } - } - - if (@$attr["http-equiv"] == 'refresh') { - $path = $attr["content"]; - $pathinfo = explode(";", $path); - foreach ($pathinfo as $value) { - if (substr(strtolower($value), 0, 4) == "url=") { - return $this->finalUrl(substr($value, 4), ++$depth); - } - } - } - } - - return $url; + return $urlResult->getURL(); } /** diff --git a/src/Network/IHTTPClient.php b/src/Network/IHTTPClient.php index 8fa5285d26..180908eede 100644 --- a/src/Network/IHTTPClient.php +++ b/src/Network/IHTTPClient.php @@ -104,14 +104,10 @@ interface IHTTPClient * through HTTP code or meta refresh tags. Stops after 10 redirections. * * @param string $url A user-submitted URL - * @param int $depth The current redirection recursion level (internal) - * @param bool $fetchbody Wether to fetch the body or not after the HEAD requests * * @return string A canonical URL * @throws \Friendica\Network\HTTPException\InternalServerErrorException * @see ParseUrl::getSiteinfo - * - * @todo Remove the $fetchbody parameter that generates an extraneous HEAD request */ - public function finalUrl(string $url, int $depth = 1, bool $fetchbody = false); + public function finalUrl(string $url); }