Merge pull request #13889 from annando/issue-13884

Issue 13884: Sanitation of links in BBCode parser
2024-02-12 15:28:04 -05:00 · 2024-02-12 15:28:04 -05:00 · 59c27a6cbb
parent 2cc8fcc4aa 3b0cc45588
commit 59c27a6cbb
9 changed files with 74 additions and 22 deletions
--- a/src/Content/PageInfo.php
+++ b/src/Content/PageInfo.php
@ -170,7 +170,7 @@ class PageInfo
 			foreach ($data['keywords'] as $keyword) {
 				/// @TODO make a positive list of allowed characters
 				$hashtag = str_replace([' ', '+', '/', '.', '#', '@', "'", '"', '’', '`', '(', ')', '„', '“'], '', $keyword);
-				$hashtags .= '#[url=' . DI::baseUrl() . '/search?tag=' . $hashtag . ']' . $hashtag . '[/url] ';
+				$hashtags .= '#[url=' . DI::baseUrl() . '/search?tag=' . urlencode($hashtag) . ']' . $hashtag . '[/url] ';
 			}
 		}

--- a/src/Content/Text/BBCode.php
+++ b/src/Content/Text/BBCode.php
@ -41,6 +41,7 @@ use Friendica\Model\Tag;
 use Friendica\Network\HTTPClient\Client\HttpClientAccept;
 use Friendica\Network\HTTPClient\Client\HttpClientOptions;
 use Friendica\Util\Map;
+use Friendica\Util\Network;
 use Friendica\Util\ParseUrl;
 use Friendica\Util\Proxy;
 use Friendica\Util\Strings;
@ -124,7 +125,7 @@ class BBCode
 						break;

 					case 'publisher_url':
-						$data['provider_url'] = html_entity_decode($value, ENT_QUOTES, 'UTF-8');
+						$data['provider_url'] = Network::sanitizeUrl(html_entity_decode($value, ENT_QUOTES, 'UTF-8'));
 						break;

 					case 'author_name':
@ -135,7 +136,7 @@ class BBCode
 						break;

 					case 'author_url':
-						$data['author_url'] = html_entity_decode($value, ENT_QUOTES, 'UTF-8');
+						$data['author_url'] = Network::sanitizeUrl(html_entity_decode($value, ENT_QUOTES, 'UTF-8'));
 						if ($data['provider_url'] == $data['author_url']) {
 							$data['author_url'] = '';
 						}
@ -434,6 +435,8 @@ class BBCode
 			return $text;
 		}

+		$data['url'] = Network::sanitizeUrl($data['url']);
+
 		if (isset($data['title'])) {
 			$data['title'] = strip_tags($data['title']);
 			$data['title'] = str_replace(['http://', 'https://'], '', $data['title']);
@ -485,6 +488,7 @@ class BBCode
 			}

 			if (!empty($data['provider_url']) && !empty($data['provider_name'])) {
+				$data['provider_url'] = Network::sanitizeUrl($data['provider_url']);
 				if (!empty($data['author_name'])) {
 					$return .= sprintf('<sup><a href="%s" target="_blank" rel="noopener noreferrer">%s (%s)</a></sup>', $data['provider_url'], $data['author_name'], $data['provider_name']);
 				} else {
@ -1064,6 +1068,21 @@ class BBCode
 		return $text;
 	}

+	/**
+	 * Callback: rebuilds a matched link tag with its URL run through Network::sanitizeUrl()
+	 *
+	 * @param array $match [full match, tag, URL] or [full match, tag, URL, label]
+	 * @return string BBCode, either "[tag]URL[/tag]" or "[tag=URL]label[/tag]"
+	 */
+	private static function sanitizeLinksCallback(array $match): string
+	{
+		$tag = $match[1];
+		$url = Network::sanitizeUrl($match[2]);
+		// Three entries: the URL is the tag content. Otherwise it is the tag attribute.
+		$open = (count($match) == 3) ? '[' . $tag . ']' . $url : '[' . $tag . '=' . $url . ']' . $match[3];
+		return $open . '[/' . $tag . ']';
+	}
+
 	/**
 	 * Callback: Expands links from given $match array
 	 *
@ -1455,7 +1474,7 @@ class BBCode

 				// Replace non graphical smilies for external posts
 				if (!$nosmile) {
-					$text = self::performWithEscapedTags($text, ['img'], function ($text) use ($simple_html, $for_plaintext) {
+					$text = self::performWithEscapedTags($text, ['url', 'img', 'audio', 'video', 'youtube', 'vimeo', 'share', 'attachment', 'iframe', 'bookmark'], function ($text) use ($simple_html, $for_plaintext) {
 						return Smilies::replace($text, ($simple_html != self::INTERNAL) || $for_plaintext);
 					});
 				}
@ -1717,6 +1736,9 @@ class BBCode
 				// Simplify "video" element
 				$text = preg_replace('(\[video[^\]]*?\ssrc\s?=\s?([^\s\]]+)[^\]]*?\].*?\[/video\])ism', '[video]$1[/video]', $text);

+				$text = preg_replace_callback("/\[(video)\](.*?)\[\/video\]/ism", [self::class, 'sanitizeLinksCallback'], $text);
+				$text = preg_replace_callback("/\[(audio)\](.*?)\[\/audio\]/ism", [self::class, 'sanitizeLinksCallback'], $text);
+
 				if ($simple_html == self::NPF) {
 					$text = preg_replace(
 						"/\[video\](.*?)\[\/video\]/ism",
@ -1759,6 +1781,7 @@ class BBCode
 				}

 				// Backward compatibility, [iframe] support has been removed in version 2020.12
+				$text = preg_replace_callback("/\[(iframe)\](.*?)\[\/iframe\]/ism", [self::class, 'sanitizeLinksCallback'], $text);
 				$text = preg_replace("/\[iframe\](.*?)\[\/iframe\]/ism", '<a href="$1">$1</a>', $text);

 				$text = self::normalizeVideoLinks($text);
@ -1811,6 +1834,9 @@ class BBCode
 					$text = '<span style="font-size: xx-large; line-height: normal;">' . $text . '</span>';
 				}

+				$text = preg_replace_callback("/\[(url)\](.*?)\[\/url\]/ism", [self::class, 'sanitizeLinksCallback'], $text);
+				$text = preg_replace_callback("/\[(url)\=(.*?)\](.*?)\[\/url\]/ism", [self::class, 'sanitizeLinksCallback'], $text);
+
 				// Handle mentions and hashtag links
 				if ($simple_html == self::DIASPORA) {
 					// The ! is converted to @ since Diaspora only understands the @
@ -1913,11 +1939,11 @@ class BBCode
 				self::performWithEscapedTags($text, ['url', 'share'], function ($text) use ($simple_html) {
 					$text = preg_replace_callback("/(?:#\[url\=[^\[\]]*\]|\[url\=[^\[\]]*\]#)(.*?)\[\/url\]/ism", function ($matches) use ($simple_html) {
 						if ($simple_html == self::ACTIVITYPUB) {
-							return '<a href="' . DI::baseUrl() . '/search?tag=' . rawurlencode($matches[1])
+							return '<a href="' . DI::baseUrl() . '/search?tag=' . urlencode($matches[1])
 								. '" data-tag="' . XML::escape($matches[1]) . '" rel="tag ugc">#'
 								. XML::escape($matches[1]) . '</a>';
 						} else {
-							return '#<a href="' . DI::baseUrl() . '/search?tag=' . rawurlencode($matches[1])
+							return '#<a href="' . DI::baseUrl() . '/search?tag=' . urlencode($matches[1])
 								. '" class="tag" rel="tag" title="' . XML::escape($matches[1]) . '">'
 								. XML::escape($matches[1]) . '</a>';
 						}
@ -1944,6 +1970,7 @@ class BBCode
 				$text = preg_replace('/acct:([^@]+)@((?!\-)(?:[a-zA-Z\d\-]{0,62}[a-zA-Z\d]\.){1,126}(?!\d+)[a-zA-Z\d]{1,63})/', '<a href="' . DI::baseUrl() . '/acctlink?addr=$1@$2" target="extlink">acct:$1@$2</a>', $text);

 				// Perform MAIL Search
+				$text = preg_replace_callback("/\[(mail)\](.*?)\[\/mail\]/ism", [self::class, 'sanitizeLinksCallback'], $text);
 				$text = preg_replace("/\[mail\](.*?)\[\/mail\]/", '<a href="mailto:$1">$1</a>', $text);
 				$text = preg_replace("/\[mail\=(.*?)\](.*?)\[\/mail\]/", '<a href="mailto:$1">$2</a>', $text);

@ -2304,7 +2331,7 @@ class BBCode

 					case '#':
 					default:
-						return $match[1] . '[url=' . DI::baseUrl() . '/search?tag=' . $match[2] . ']' . $match[2] . '[/url]';
+						return $match[1] . '[url=' . DI::baseUrl() . '/search?tag=' . urlencode($match[2]) . ']' . $match[2] . '[/url]';
 				}
 			},
 			$body
--- a/src/Model/Item.php
+++ b/src/Model/Item.php
@ -2491,7 +2491,7 @@ class Item
 				}

 				$basetag = str_replace('_', ' ', substr($tag, 1));
-				$newtag = '#[url=' . DI::baseUrl() . '/search?tag=' . $basetag . ']' . $basetag . '[/url]';
+				$newtag = '#[url=' . DI::baseUrl() . '/search?tag=' . urlencode($basetag) . ']' . $basetag . '[/url]';

 				$body = str_replace($tag, $newtag, $body);
 			}
--- a/src/Model/Post/Link.php
+++ b/src/Model/Post/Link.php
@ -31,6 +31,7 @@ use Friendica\Util\HTTPSignature;
 use Friendica\Util\Images;
 use Friendica\Util\Proxy;
 use Friendica\Object\Image;
+use Friendica\Util\Network;

 /**
 * Class Link
@ -77,7 +78,7 @@ class Link
 		} else {
 			$fields = self::fetchMimeType($url);
 			$fields['uri-id'] = $uriId;
-			$fields['url'] = $url;
+			$fields['url'] = Network::sanitizeUrl($url);

 			DBA::insert('post-link', $fields, Database::INSERT_IGNORE);
 			$id = DBA::lastInsertId();
--- a/src/Model/Post/Media.php
+++ b/src/Model/Post/Media.php
@ -96,6 +96,7 @@ class Media
 			return false;
 		}

+		$media['url'] = Network::sanitizeUrl($media['url']);
 		$media = self::unsetEmptyFields($media);
 		$media = DI::dbaDefinition()->truncateFieldsForTable('post-media', $media);

--- a/src/Model/Tag.php
+++ b/src/Model/Tag.php
@ -558,7 +558,7 @@ class Tag
 		);
 		while ($tag = DBA::fetch($taglist)) {
 			if ($tag['url'] == '') {
-				$tag['url'] = $searchpath . rawurlencode($tag['name']);
+				$tag['url'] = $searchpath . urlencode($tag['name']);
 			}

 			$orig_tag = $tag['url'];
--- a/src/Module/Profile/Profile.php
+++ b/src/Module/Profile/Profile.php
@ -226,7 +226,7 @@ class Profile extends BaseProfile
 			// Separator is defined in Module\Settings\Profile\Index::cleanKeywords
 			foreach (explode(', ', $profile['pub_keywords']) as $tag_label) {
 				$tags[] = [
-					'url'   => '/search?tag=' . $tag_label,
+					'url'   => '/search?tag=' . urlencode($tag_label),
 					'label' => Tag::TAG_CHARACTER[Tag::HASHTAG] . $tag_label,
 				];
 			}
--- a/src/Util/Network.php
+++ b/src/Util/Network.php
@ -659,6 +659,29 @@ class Network
 		return !empty($scheme) && in_array($scheme, ['http', 'https']) && parse_url($url, PHP_URL_HOST);
 	}

+	/**
+	 * Cut an URL at the first double quote or space
+	 *
+	 * The value is trimmed first; everything from the first '"' or ' ' on is
+	 * dropped and the remainder is trimmed again. A debug log entry is written
+	 * whenever the result differs from the trimmed input.
+	 *
+	 * @param string $url URL to clean up
+	 * @return string sanitized URL, possibly empty
+	 */
+	public static function sanitizeUrl(string $url): string
+	{
+		$url = trim($url);
+
+		// strcspn() gives the length of the leading part without '"' and ' '
+		$sanitized = trim(substr($url, 0, strcspn($url, '" ')));
+
+		if ($sanitized !== $url) {
+			Logger::debug('Link got sanitized', ['url' => $url, 'sanitized' => $sanitized]);
+		}
+		return $sanitized;
+	}
+
 	/**
 	 * Creates an Uri object out of a given Uri string
 	 *
--- a/src/Util/ParseUrl.php
+++ b/src/Util/ParseUrl.php
@ -274,13 +274,13 @@ class ParseUrl
 						$siteinfo['author_name'] = trim($oembed_data->author_name);
 					}
 					if (!empty($oembed_data->author_url)) {
-						$siteinfo['author_url'] = trim($oembed_data->author_url);
+						$siteinfo['author_url'] = Network::sanitizeUrl($oembed_data->author_url);
 					}
 					if (!empty($oembed_data->provider_name)) {
 						$siteinfo['publisher_name'] = trim($oembed_data->provider_name);
 					}
 					if (!empty($oembed_data->provider_url)) {
-						$siteinfo['publisher_url'] = trim($oembed_data->provider_url);
+						$siteinfo['publisher_url'] = Network::sanitizeUrl($oembed_data->provider_url);
 					}
 					if (!empty($oembed_data->thumbnail_url)) {
 						$siteinfo['image'] = $oembed_data->thumbnail_url;
@ -884,7 +884,7 @@ class ParseUrl

 			$content = JsonLD::fetchElement($jsonld, 'publisher', 'url');
 			if (!empty($content) && is_string($content)) {
-				$jsonldinfo['publisher_url'] = trim($content);
+				$jsonldinfo['publisher_url'] = Network::sanitizeUrl($content);
 			}

 			$brand = JsonLD::fetchElement($jsonld, 'publisher', 'brand', '@type', 'Organization');
@ -896,7 +896,7 @@ class ParseUrl

 				$content = JsonLD::fetchElement($brand, 'url');
 				if (!empty($content) && is_string($content)) {
-					$jsonldinfo['publisher_url'] = trim($content);
+					$jsonldinfo['publisher_url'] = Network::sanitizeUrl($content);
 				}

 				$content = JsonLD::fetchElement($brand, 'logo', 'url');
@ -924,12 +924,12 @@ class ParseUrl

 			$content = JsonLD::fetchElement($jsonld, 'author', 'sameAs');
 			if (!empty($content) && is_string($content)) {
-				$jsonldinfo['author_url'] = trim($content);
+				$jsonldinfo['author_url'] = Network::sanitizeUrl($content);
 			}

 			$content = JsonLD::fetchElement($jsonld, 'author', 'url');
 			if (!empty($content) && is_string($content)) {
-				$jsonldinfo['author_url'] = trim($content);
+				$jsonldinfo['author_url'] = Network::sanitizeUrl($content);
 			}

 			$logo = JsonLD::fetchElement($jsonld, 'author', 'logo');
@ -1084,7 +1084,7 @@ class ParseUrl

 		$content = JsonLD::fetchElement($jsonld, 'url');
 		if (!empty($content) && is_string($content)) {
-			$jsonldinfo['publisher_url'] = trim($content);
+			$jsonldinfo['publisher_url'] = Network::sanitizeUrl($content);
 		}

 		$content = JsonLD::fetchElement($jsonld, 'thumbnailUrl');
@ -1123,7 +1123,7 @@ class ParseUrl

 		$content = JsonLD::fetchElement($jsonld, 'url');
 		if (!empty($content) && is_string($content)) {
-			$jsonldinfo['publisher_url'] = trim($content);
+			$jsonldinfo['publisher_url'] = Network::sanitizeUrl($content);
 		}

 		$content = JsonLD::fetchElement($jsonld, 'logo', 'url', '@type', 'ImageObject');
@ -1140,7 +1140,7 @@ class ParseUrl

 		$content = JsonLD::fetchElement($jsonld, 'brand', 'url', '@type', 'Organization');
 		if (!empty($content) && is_string($content)) {
-			$jsonldinfo['publisher_url'] = trim($content);
+			$jsonldinfo['publisher_url'] = Network::sanitizeUrl($content);
 		}

 		Logger::info('Fetched Organization information', ['url' => $siteinfo['url'], 'fetched' => $jsonldinfo]);
@ -1172,12 +1172,12 @@ class ParseUrl

 		$content = JsonLD::fetchElement($jsonld, 'sameAs');
 		if (!empty($content) && is_string($content)) {
-			$jsonldinfo['author_url'] = trim($content);
+			$jsonldinfo['author_url'] = Network::sanitizeUrl($content);
 		}

 		$content = JsonLD::fetchElement($jsonld, 'url');
 		if (!empty($content) && is_string($content)) {
-			$jsonldinfo['author_url'] = trim($content);
+			$jsonldinfo['author_url'] = Network::sanitizeUrl($content);
 		}

 		$content = JsonLD::fetchElement($jsonld, 'image', 'url', '@type', 'ImageObject');