friendica/src/Content/PageInfo.php

<?php
/**
 * @copyright Copyright (C) 2020, Friendica
 *
 * @license GNU AGPL version 3 or any later version
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 */

namespace Friendica\Content;

use Friendica\Core\Hook;
use Friendica\Core\Logger;
use Friendica\DI;
use Friendica\Network\HTTPException;
use Friendica\Util\ParseUrl;
use Friendica\Util\Strings;

/**
 * Extracts trailing URLs from post bodies to transform them in enriched attachment tags through Site Info query
 */
class PageInfo
{
	/**
	 * @param string $body
	 * @param bool   $searchNakedUrls
	 * @param bool   $no_photos
	 * @return string
	 * @throws HTTPException\InternalServerErrorException
	 */
	public static function searchAndAppendToBody(string $body, bool $searchNakedUrls = false, bool $no_photos = false)
	{
		Logger::info('add_page_info_to_body: fetch page info for body', ['body' => $body]);

		$url = self::getRelevantUrlFromBody($body, $searchNakedUrls);
		if (!$url) {
			return $body;
		}

		$data = self::queryUrl($url);
		if (!$data) {
			return $body;
		}

		return self::appendDataToBody($body, $data, $no_photos);
	}

	/**
	 * @param string $body
	 * @param array  $data
	 * @param bool   $no_photos
	 * @return string
	 * @throws HTTPException\InternalServerErrorException
	 */
	public static function appendDataToBody(string $body, array $data, bool $no_photos = false)
	{
		// Only one [attachment] tag per body is allowed
		$existingAttachmentPos = strpos($body, '[attachment');
		if ($existingAttachmentPos !== false) {
			$linkTitle = $data['title'] ?: $data['url'];
			// Additional link attachments are prepended before the existing [attachment] tag
			$body = substr_replace($body, "\n[bookmark=" . $data['url'] . ']' . $linkTitle . "[/bookmark]\n", $existingAttachmentPos, 0);
		} else {
			$footer = PageInfo::getFooterFromData($data, $no_photos);
			$body = self::stripTrailingUrlFromBody($body, $data['url']);
			$body .= "\n" . $footer;
		}

		return $body;
	}

	/**
	 * @param string $url
	 * @param bool $no_photos
	 * @param string $photo
	 * @param bool $keywords
	 * @param string $keyword_denylist
	 * @return string
	 * @throws HTTPException\InternalServerErrorException
	 */
	public static function getFooterFromUrl(string $url, bool $no_photos = false, string $photo = '', bool $keywords = false, string $keyword_denylist = '')
	{
		$data = self::queryUrl($url, $photo, $keywords, $keyword_denylist);

		return self::getFooterFromData($data, $no_photos);
	}

	/**
	 * @param array $data
	 * @param bool  $no_photos
	 * @return string
	 * @throws HTTPException\InternalServerErrorException
	 */
	public static function getFooterFromData(array $data, bool $no_photos = false)
	{
		Hook::callAll('page_info_data', $data);

		if (empty($data['type'])) {
			return '';
		}

		// It maybe is a rich content, but if it does have everything that a link has,
		// then treat it that way
		if (($data['type'] == 'rich') && is_string($data['title']) &&
			is_string($data['text']) && !empty($data['images'])) {
			$data['type'] = 'link';
		}

		$data['title'] = $data['title'] ?? '';

		if ((($data['type'] != 'link') && ($data['type'] != 'video') && ($data['type'] != 'photo')) || ($data['title'] == $data['url'])) {
			return '';
		}

		if ($no_photos && ($data['type'] == 'photo')) {
			return '';
		}

		// Escape some bad characters
		$text = "[attachment";

		foreach (['type', 'url', 'title', 'alternative_title', 'publisher_name', 'publisher_url', 'publisher_img', 'author_name', 'author_url', 'author_img'] as $field) {
			if (!empty($data[$field])) {
				$text .= " " . $field . "='" . str_replace(['[', ']'], ['&#91;', '&#93;'], htmlentities($data[$field], ENT_QUOTES, 'UTF-8', false)) . "'";
			}
		}

		if (empty($data['text'])) {
			$data['text'] = '';
		}

		// Only embedd a picture link when it seems to be a valid picture ("width" is set)
		if (!empty($data['images']) && !empty($data['images'][0]['width'])) {
			$preview = str_replace(['[', ']'], ['&#91;', '&#93;'], htmlentities($data['images'][0]['src'], ENT_QUOTES, 'UTF-8', false));
			// if the preview picture is larger than 500 pixels then show it in a larger mode
			// But only, if the picture isn't higher than large (To prevent huge posts)
			if (!DI::config()->get('system', 'always_show_preview') && ($data['images'][0]['width'] >= 500)
				&& ($data['images'][0]['width'] >= $data['images'][0]['height'])) {
				$text .= " image='" . $preview . "'";
			} else {
				$text .= " preview='" . $preview . "'";

				if (empty($data['text'])) {
					$data['text'] = $data['title'];
				}
		
				if (empty($data['text'])) {
					$data['text'] = $data['url'];
				}
			}
		}

		$text .= ']' . str_replace(['[', ']'], ['&#91;', '&#93;'], $data['text']) . '[/attachment]';

		$hashtags = '';
		if (!empty($data['keywords'])) {
			$hashtags = "\n";
			foreach ($data['keywords'] as $keyword) {
				/// @TODO make a positive list of allowed characters
				$hashtag = str_replace([' ', '+', '/', '.', '#', '@', "'", '"', '’', '`', '(', ')', '„', '“'], '', $keyword);
				$hashtags .= '#[url=' . DI::baseUrl() . '/search?tag=' . $hashtag . ']' . $hashtag . '[/url] ';
			}
		}

		return $text . $hashtags;
	}

	/**
	 * @param string  $url
	 * @param string $photo
	 * @param bool $keywords
	 * @param string $keyword_denylist
	 * @return array|bool
	 * @throws HTTPException\InternalServerErrorException
	 */
	public static function queryUrl(string $url, string $photo = '', bool $keywords = false, string $keyword_denylist = '')
	{
		$data = ParseUrl::getSiteinfoCached($url);

		if ($photo != '') {
			$data['images'][0]['src'] = $photo;
		}

		if (!$keywords) {
			unset($data['keywords']);
		} elseif ($keyword_denylist && !empty($data['keywords'])) {
			$list = explode(', ', $keyword_denylist);

			foreach ($list as $keyword) {
				$keyword = trim($keyword);

				$index = array_search($keyword, $data['keywords']);
				if ($index !== false) {
					unset($data['keywords'][$index]);
				}
			}
		}

		Logger::info('fetch page info for URL', ['url' => $url, 'data' => $data]);

		return $data;
	}

	/**
	 * @param string $url
	 * @param string $photo
	 * @param string $keyword_denylist
	 * @return array
	 * @throws HTTPException\InternalServerErrorException
	 */
	public static function getTagsFromUrl(string $url, string $photo = '', string $keyword_denylist = '')
	{
		$data = self::queryUrl($url, $photo, true, $keyword_denylist);

		if (empty($data['keywords'])) {
			return [];
		}

		$taglist = [];
		foreach ($data['keywords'] as $keyword) {
			$hashtag = str_replace([' ', '+', '/', '.', '#', "'"],
				['', '', '', '', '', ''], $keyword);

			$taglist[] = $hashtag;
		}

		return $taglist;
	}

	/**
	 * Picks a non-hashtag, non-mention, schemeful URL at the end of the provided body string to be converted into Page Info.
	 *
	 * @param string $body
	 * @param bool   $searchNakedUrls Whether we should pick a naked URL (outside of BBCode tags) as a last resort
	 * @return string|null
	 */
	protected static function getRelevantUrlFromBody(string $body, bool $searchNakedUrls = false)
	{
		$URLSearchString = 'https?://[^\[\]]*';

		// Fix for Mastodon where the mentions are in a different format
		$body = preg_replace("~\[url=($URLSearchString)]([#!@])(.*?)\[/url]~is", '$2[url=$1]$3[/url]', $body);

		preg_match("~(?<![!#@])\[url]($URLSearchString)\[/url]$~is", $body, $matches);

		if (!$matches) {
			preg_match("~(?<![!#@])\[url=($URLSearchString)].*\[/url]$~is", $body, $matches);
		}

		if (!$matches && $searchNakedUrls) {
			preg_match(Strings::autoLinkRegEx(), $body, $matches);
			if ($matches && !Strings::endsWith($body, $matches[1])) {
				unset($matches);
			}
		}

		return $matches[1] ?? null;
	}

	/**
	 * Remove the provided URL from the body if it is at the end of it.
	 * Keep the link label if it isn't the full URL or a shortened version of it.
	 *
	 * @param string $body
	 * @param string $url
	 * @return string
	 */
	protected static function stripTrailingUrlFromBody(string $body, string $url)
	{
		$quotedUrl = preg_quote($url, '#');
		$body = preg_replace_callback("#(?:
			\[url]$quotedUrl\[/url]|
			\[url=$quotedUrl]$quotedUrl\[/url]|
			\[url=$quotedUrl]([^[]*?)\[/url]|
			$quotedUrl
		)$#isx", function ($match) use ($url) {
			// Stripping URLs with no label
			if (empty($match[1])) {
				return '';
			}

			// Stripping link labels that include a shortened version of the URL
			if (strpos($url, trim($match[1], '.…')) !== false) {
				return '';
			}

			// Keep all other labels
			return $match[1];
		}, $body);

		return rtrim($body);
	}
}