Merge pull request #13642 from annando/pre-storage-match

Check for user defined channel matches before storing them
This commit is contained in:
Hypolite Petovan 2023-11-15 09:57:40 -08:00 committed by GitHub
commit fc0703005d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 237 additions and 42 deletions

View File

@ -1,6 +1,6 @@
-- ------------------------------------------
-- Friendica 2023.09-rc (Giant Rhubarb)
-- DB_UPDATE_VERSION 1539
-- DB_UPDATE_VERSION 1540
-- ------------------------------------------
@ -1867,6 +1867,16 @@ CREATE TABLE IF NOT EXISTS `subscription` (
FOREIGN KEY (`uid`) REFERENCES `user` (`uid`) ON UPDATE RESTRICT ON DELETE CASCADE
) DEFAULT COLLATE utf8mb4_general_ci COMMENT='Push Subscription for the API';
--
-- TABLE check-full-text-search
--
CREATE TABLE IF NOT EXISTS `check-full-text-search` (
`pid` int unsigned NOT NULL COMMENT 'The ID of the process',
`searchtext` mediumtext COMMENT 'Simplified text for the full text search',
PRIMARY KEY(`pid`),
FULLTEXT INDEX `searchtext` (`searchtext`)
) DEFAULT COLLATE utf8mb4_general_ci COMMENT='Check for a full text search match in user defined channels before storing the message in the system';
--
-- TABLE userd
--

View File

@ -68,6 +68,7 @@ Additionally to the search for content, there are additional keywords that can b
* network:dscs - Posts that are received by the Discourse connector.
* network:tmbl - Posts that are received by the Tumblr connector.
* network:bsky - Posts that are received by the Bluesky connector.
* platform - Use this to include or exclude some platforms from your channel, e.g. "+platform:friendica".
* visibility - You have the choice between different visibilities. You can only see unlisted or private posts that you have access to.
* visibility:public
* visibility:unlisted

View File

@ -18,6 +18,7 @@ Database Tables
| [attach](help/database/db_attach) | file attachments |
| [cache](help/database/db_cache) | Stores temporary data |
| [channel](help/database/db_channel) | User defined Channels |
| [check-full-text-search](help/database/db_check-full-text-search) | Check for a full text search match in user defined channels before storing the message in the system |
| [config](help/database/db_config) | main configuration storage |
| [contact](help/database/db_contact) | contact table |
| [contact-relation](help/database/db_contact-relation) | Contact relations |

View File

@ -0,0 +1,23 @@
Table check-full-text-search
===========
Check for a full text search match in user defined channels before storing the message in the system
Fields
------
| Field | Description | Type | Null | Key | Default | Extra |
| ---------- | ---------------------------------------- | ------------ | ---- | --- | ------- | ----- |
| pid | The ID of the process | int unsigned | NO | PRI | 0 | |
| searchtext | Simplified text for the full text search | mediumtext | YES | | NULL | |
Indexes
------------
| Name | Fields |
| ---------- | -------------------- |
| PRIMARY | pid |
| searchtext | FULLTEXT, searchtext |
Return to [database documentation](help/database)

View File

@ -25,16 +25,23 @@ use Friendica\BaseCollection;
use Friendica\Content\Conversation\Collection\UserDefinedChannels;
use Friendica\Content\Conversation\Entity;
use Friendica\Content\Conversation\Factory;
use Friendica\Core\PConfig\Capability\IManagePersonalConfigValues;
use Friendica\Database\Database;
use Friendica\Model\User;
use Psr\Log\LoggerInterface;
class UserDefinedChannel extends \Friendica\BaseRepository
{
protected static $table_name = 'channel';
public function __construct(Database $database, LoggerInterface $logger, Factory\UserDefinedChannel $factory)
/** @var IManagePersonalConfigValues */
private $pConfig;
public function __construct(Database $database, LoggerInterface $logger, Factory\UserDefinedChannel $factory, IManagePersonalConfigValues $pConfig)
{
parent::__construct($database, $logger, $factory);
$this->pConfig = $pConfig;
}
/**
@ -89,7 +96,7 @@ class UserDefinedChannel extends \Friendica\BaseRepository
*/
public function deleteById(int $id, int $uid): bool
{
return $this->db->delete('channel', ['id' => $id, 'uid' => $uid]);
return $this->db->delete(self::$table_name, ['id' => $id, 'uid' => $uid]);
}
/**
@ -130,4 +137,41 @@ class UserDefinedChannel extends \Friendica\BaseRepository
return $Channel;
}
/**
 * Checks, if one of the user defined channels matches with the given search text
 *
 * The search text is written to the `check-full-text-search` table under the ID of the
 * current process. The full text search expression of each channel is then evaluated
 * against that single row with MATCH ... AGAINST in boolean mode. A channel only counts
 * as a match when the given language is also part of the languages configured for the
 * channel owner. The temporary row is removed again before returning.
 *
 * @todo To increase the performance, this functionality should be replaced with a single SQL call.
 *
 * @param string $searchtext
 * @param string $language
 * @return boolean
 */
public function match(string $searchtext, string $language): bool
{
	if (!in_array($language, User::getLanguages())) {
		$this->logger->debug('Unwanted language found. No matched channel found.', ['language' => $language, 'searchtext' => $searchtext]);
		return false;
	}

	// The process ID is the key of the temporary row; fetch it once instead of once per channel.
	$pid   = getmypid();
	$store = false;

	$this->db->insert('check-full-text-search', ['pid' => $pid, 'searchtext' => $searchtext], Database::INSERT_UPDATE);

	try {
		$channels = $this->db->select(self::$table_name, ['full-text-search', 'uid', 'label'], ["`full-text-search` != ?", '']);

		while ($channel = $this->db->fetch($channels)) {
			$channelsearchtext = $channel['full-text-search'];

			// Quote "keyword:value" terms so that they are searched as a phrase (same handling as in the timeline module).
			foreach (['from', 'to', 'group', 'tag', 'network', 'platform', 'visibility'] as $keyword) {
				$channelsearchtext = preg_replace('~(' . $keyword . ':.[\w@\.-]+)~', '"$1"', $channelsearchtext);
			}

			if (!$this->db->exists('check-full-text-search', ["`pid` = ? AND MATCH (`searchtext`) AGAINST (? IN BOOLEAN MODE)", $pid, $channelsearchtext])) {
				continue;
			}

			// Falls back to the language code of the channel owner when no channel languages are configured.
			if (in_array($language, $this->pConfig->get($channel['uid'], 'channel', 'languages', [User::getLanguageCode($channel['uid'])]))) {
				$store = true;
				$this->logger->debug('Matching channel found.', ['uid' => $channel['uid'], 'label' => $channel['label'], 'language' => $language, 'channelsearchtext' => $channelsearchtext, 'searchtext' => $searchtext]);
				break;
			}
		}
	} finally {
		// Always release the statement and remove the temporary row, even when something above throws.
		if (isset($channels)) {
			$this->db->close($channels);
		}

		$this->db->delete('check-full-text-search', ['pid' => $pid]);
	}

	return $store;
}
}

View File

@ -240,7 +240,7 @@ class GServer
} elseif (!empty($contact['baseurl'])) {
$server = $contact['baseurl'];
} elseif ($contact['network'] == Protocol::DIASPORA) {
$parts = parse_url($contact['url']);
$parts = (array)parse_url($contact['url']);
unset($parts['path']);
$server = (string)Uri::fromParts($parts);
} else {
@ -589,7 +589,7 @@ class GServer
if ((parse_url($url, PHP_URL_HOST) != parse_url($valid_url, PHP_URL_HOST)) && (parse_url($url, PHP_URL_PATH) != parse_url($valid_url, PHP_URL_PATH)) &&
(parse_url($url, PHP_URL_PATH) == '')) {
Logger::debug('Found redirect. Mark old entry as failure and redirect to the basepath.', ['old' => $url, 'new' => $valid_url]);
$parts = parse_url($valid_url);
$parts = (array)parse_url($valid_url);
unset($parts['path']);
$valid_url = (string)Uri::fromParts($parts);

View File

@ -33,6 +33,7 @@ use Friendica\Model\Post;
use Friendica\Model\Tag;
use Friendica\Model\Verb;
use Friendica\Protocol\Activity;
use Friendica\Protocol\ActivityPub\Receiver;
use Friendica\Protocol\Relay;
use Friendica\Util\DateTimeFormat;
@ -52,7 +53,7 @@ class Engagement
}
$parent = Post::selectFirst(['uri-id', 'created', 'author-id', 'owner-id', 'uid', 'private', 'contact-contact-type', 'language', 'network',
'title', 'content-warning', 'body', 'author-contact-type', 'author-nick', 'author-addr', 'owner-contact-type', 'owner-nick', 'owner-addr'],
'title', 'content-warning', 'body', 'author-contact-type', 'author-nick', 'author-addr', 'author-gsid', 'owner-contact-type', 'owner-nick', 'owner-addr'],
['uri-id' => $item['parent-uri-id']]);
if ($parent['created'] < self::getCreationDateLimit(false)) {
@ -79,7 +80,14 @@ class Engagement
$mediatype = self::getMediaType($item['parent-uri-id']);
if (!$store) {
$mediatype = !empty($mediatype);
$store = !empty($mediatype);
}
$searchtext = self::getSearchTextForItem($parent);
if (!$store) {
$content = trim(($parent['title'] ?? '') . ' ' . ($parent['content-warning'] ?? '') . ' ' . ($parent['body'] ?? ''));
$language = array_key_first(Item::getLanguageArray($content, 1, 0, $parent['author-id']));
$store = DI::userDefinedChannel()->match($searchtext, $language);
}
$engagement = [
@ -88,7 +96,7 @@ class Engagement
'contact-type' => $parent['contact-contact-type'],
'media-type' => $mediatype,
'language' => $parent['language'],
'searchtext' => self::getSearchText($parent),
'searchtext' => $searchtext,
'created' => $parent['created'],
'restricted' => !in_array($item['network'], Protocol::FEDERATED) || ($parent['private'] != Item::PUBLIC),
'comments' => DBA::count('post', ['parent-uri-id' => $item['parent-uri-id'], 'gravity' => Item::GRAVITY_COMMENT]),
@ -106,10 +114,56 @@ class Engagement
Logger::debug('Engagement stored', ['fields' => $engagement, 'ret' => $ret]);
}
private static function getSearchText(array $item): string
/**
 * Builds the search text for an activity, based on its content, author, tags and receivers
 *
 * An item array with the ActivityPub network is assembled from the given data and handed
 * to getSearchText(). Author and owner are both set to the given author. The item is
 * treated as private unless the public collection is among the receivers.
 *
 * @param string $content   Used as the body of the assembled item
 * @param int    $author_id Contact ID of the author
 * @param array  $tags      Tags that are passed on to getSearchText()
 * @param array  $receivers Receiver URLs that are passed on to getSearchText()
 * @return string
 */
public static function getSearchTextForActivity(string $content, int $author_id, array $tags, array $receivers): string
{
	// NOTE(review): presumably the lookup yields no row for an unknown contact - confirm.
	// The null coalescing below keeps the resulting values (null) but avoids array offset warnings in that case.
	$author = Contact::getById($author_id);

	$contact_type = $author['contact-type'] ?? null;
	$nick         = $author['nick'] ?? null;
	$addr         = $author['addr'] ?? null;

	$item = [
		'uri-id'              => 0,
		'network'             => Protocol::ACTIVITYPUB,
		'title'               => '',
		'content-warning'     => '',
		'body'                => $content,
		// Loose comparison on purpose, identical to comparing each receiver with "=="
		'private'             => in_array(Receiver::PUBLIC_COLLECTION, $receivers) ? Item::PUBLIC : Item::PRIVATE,
		'author-id'           => $author_id,
		'author-contact-type' => $contact_type,
		'author-nick'         => $nick,
		'author-addr'         => $addr,
		'author-gsid'         => $author['gsid'] ?? null,
		'owner-id'            => $author_id,
		'owner-contact-type'  => $contact_type,
		'owner-nick'          => $nick,
		'owner-addr'          => $addr,
	];

	return self::getSearchText($item, $receivers, $tags);
}
/**
 * Builds the search text for an item, using the tags that belong to its uri-id
 *
 * The URLs of the mention and audience tags are used as receivers, the names of the
 * hashtag tags are used as tags.
 *
 * @param array $item
 * @return string
 */
private static function getSearchTextForItem(array $item): string
{
	$uri_id = $item['uri-id'];

	$mentions = Tag::getByURIId($uri_id, [Tag::MENTION, Tag::IMPLICIT_MENTION, Tag::EXCLUSIVE_MENTION, Tag::AUDIENCE]);
	$hashtags = Tag::getByURIId($uri_id, [Tag::HASHTAG]);

	return self::getSearchText(
		$item,
		array_column($mentions, 'url'),
		array_column($hashtags, 'name')
	);
}
private static function getSearchText(array $item, array $receivers, array $tags): string
{
$body = '[nosmile]network:' . $item['network'];
if (!empty($item['author-gsid'])) {
$gserver = DBA::selectFirst('gserver', ['platform'], ['id' => $item['author-gsid']]);
$platform = preg_replace( '/[\W]/', '', $gserver['platform'] ?? '');
if (!empty($platform)) {
$body .= ' platform:' . $platform;
}
}
switch ($item['private']) {
case Item::PUBLIC:
$body .= ' visibility:public';
@ -136,8 +190,8 @@ class Engagement
}
}
foreach (Tag::getByURIId($item['uri-id'], [Tag::MENTION, Tag::IMPLICIT_MENTION, Tag::EXCLUSIVE_MENTION, Tag::AUDIENCE]) as $tag) {
$contact = Contact::getByURL($tag['name'], false, ['nick', 'addr', 'contact-type']);
foreach ($receivers as $receiver) {
$contact = Contact::getByURL($receiver, false, ['nick', 'addr', 'contact-type']);
if (empty($contact)) {
continue;
}
@ -149,8 +203,8 @@ class Engagement
}
}
foreach (Tag::getByURIId($item['uri-id'], [Tag::HASHTAG]) as $tag) {
$body .= ' tag:' . $tag['name'];
foreach ($tags as $tag) {
$body .= ' tag:' . $tag;
}
$body .= ' ' . $item['title'] . ' ' . $item['content-warning'] . ' ' . $item['body'];

View File

@ -22,6 +22,8 @@
namespace Friendica\Model;
use Friendica\Database\DBA;
use Friendica\DI;
use Friendica\Util\DateTimeFormat;
/**
* Model for DB specific logic for the search entity
@ -36,14 +38,43 @@ class Search
*/
public static function getUserTags(): array
{
$termsStmt = DBA::p("SELECT DISTINCT(`term`) FROM `search`");
$user_condition = ["`verified` AND NOT `blocked` AND NOT `account_removed` AND NOT `account_expired` AND `user`.`uid` > ?", 0];
$abandon_days = intval(DI::config()->get('system', 'account_abandon_days'));
if (!empty($abandon_days)) {
$user_condition = DBA::mergeConditions($user_condition, ["`last-activity` > ?", DateTimeFormat::utc('now - ' . $abandon_days . ' days')]);
}
$condition = $user_condition;
$condition[0] = "SELECT DISTINCT(`term`) FROM `search` INNER JOIN `user` ON `search`.`uid` = `user`.`uid` WHERE " . $user_condition[0];
$sql = array_shift($condition);
$termsStmt = DBA::p($sql, $condition);
$tags = [];
while ($term = DBA::fetch($termsStmt)) {
$tags[] = trim(mb_strtolower($term['term']), '#');
}
DBA::close($termsStmt);
$condition = $user_condition;
$condition[0] = "SELECT `include-tags` FROM `channel` INNER JOIN `user` ON `channel`.`uid` = `user`.`uid` WHERE " . $user_condition[0];
$sql = array_shift($condition);
$channels = DBA::p($sql, $condition);
while ($channel = DBA::fetch($channels)) {
foreach (explode(',', $channel['include-tags']) as $tag) {
$tag = trim(mb_strtolower($tag));
if (empty($tag)) {
continue;
}
if (!in_array($tag, $tags)) {
$tags[] = $tag;
}
}
}
DBA::close($channels);
sort($tags);
return $tags;
}
}

View File

@ -582,6 +582,12 @@ class User
*/
public static function getLanguages(): array
{
$cachekey = 'user:getLanguages';
$languages = DI::cache()->get($cachekey);
if (!is_null($languages)) {
return $languages;
}
$supported = array_keys(DI::l10n()->getLanguageCodes());
$languages = [];
$uids = [];
@ -620,7 +626,10 @@ class User
DBA::close($channels);
ksort($languages);
return array_keys($languages);
$languages = array_keys($languages);
DI::cache()->set($cachekey, $languages);
return $languages;
}
/**

View File

@ -391,7 +391,7 @@ class Timeline extends BaseModule
if (!empty($channel->fullTextSearch)) {
$search = $channel->fullTextSearch;
foreach (['from', 'to', 'group', 'tag', 'network', 'visibility'] as $keyword) {
foreach (['from', 'to', 'group', 'tag', 'network', 'platform', 'visibility'] as $keyword) {
$search = preg_replace('~(' . $keyword . ':.[\w@\.-]+)~', '"$1"', $search);
}
$condition = DBA::mergeConditions($condition, ["MATCH (`searchtext`) AGAINST (? IN BOOLEAN MODE)", $search]);

View File

@ -42,6 +42,7 @@ use Friendica\Model\Mail;
use Friendica\Model\Tag;
use Friendica\Model\User;
use Friendica\Model\Post;
use Friendica\Model\Post\Engagement;
use Friendica\Protocol\Activity;
use Friendica\Protocol\ActivityPub;
use Friendica\Protocol\Delivery;
@ -751,7 +752,7 @@ class Processor
public static function addToFeaturedCollection(array $activity)
{
$post = self::getUriIdForFeaturedCollection($activity);
if (empty($post)) {
if (empty($post) || empty($post['author-id'])) {
Queue::remove($activity);
return;
}
@ -1562,20 +1563,17 @@ class Processor
return '';
}
$ldobject = JsonLD::compact($object);
$signer = [];
if (!empty($object['attributedTo'])) {
$attributed_to = $object['attributedTo'];
if (is_array($attributed_to)) {
$compacted = JsonLD::compact($object);
$attributed_to = JsonLD::fetchElement($compacted, 'as:attributedTo', '@id');
}
$attributed_to = JsonLD::fetchElement($ldobject, 'as:attributedTo', '@id');
if (!empty($attributed_to)) {
$signer[] = $attributed_to;
}
if (!empty($object['actor'])) {
$object_actor = $object['actor'];
} elseif (!empty($attributed_to)) {
$object_actor = JsonLD::fetchElement($ldobject, 'as:actor', '@id');
if (!empty($attributed_to)) {
$object_actor = $attributed_to;
} else {
// Shouldn't happen
@ -1591,8 +1589,6 @@ class Processor
$actor = $object_actor;
}
$ldobject = JsonLD::compact($object);
$type = JsonLD::fetchElement($ldobject, '@type');
$object_id = JsonLD::fetchElement($ldobject, 'as:object', '@id');
@ -1607,10 +1603,11 @@ class Processor
}
$activity = $object;
$ldactivity = $ldobject;
} else {
} elseif (!empty($object['id'])) {
$activity = self::getActivityForObject($object, $actor);
$ldactivity = JsonLD::compact($activity);
$object_id = $object['id'];
} else {
return null;
}
$ldactivity['recursion-depth'] = !empty($child['recursion-depth']) ? $child['recursion-depth'] + 1 : 0;
@ -1631,7 +1628,7 @@ class Processor
if ($completion == Receiver::COMPLETION_RELAY) {
$ldactivity['from-relay'] = $ldactivity['thread-completion'];
if (in_array($type, Receiver::CONTENT_TYPES) && !self::acceptIncomingMessage($ldactivity, $object_id)) {
if (in_array($type, Receiver::CONTENT_TYPES) && !self::acceptIncomingMessage($ldactivity)) {
return null;
}
}
@ -1684,16 +1681,18 @@ class Processor
* Test if incoming relay messages should be accepted
*
* @param array $activity activity array
* @param string $id object ID
* @return boolean true if message is accepted
*/
private static function acceptIncomingMessage(array $activity, string $id): bool
private static function acceptIncomingMessage(array $activity): bool
{
if (empty($activity['as:object'])) {
$id = JsonLD::fetchElement($activity, '@id');
Logger::info('No object field in activity - accepted', ['id' => $id]);
return true;
}
$id = JsonLD::fetchElement($activity, 'as:object', '@id');
$replyto = JsonLD::fetchElement($activity['as:object'], 'as:inReplyTo', '@id');
$uriid = ItemURI::getIdByURI($replyto ?? '');
if (Post::exists(['uri-id' => $uriid])) {
@ -1731,7 +1730,23 @@ class Processor
$languages = self::getPostLanguages($activity['as:object'] ?? '');
return Relay::isSolicitedPost($messageTags, $content, $authorid, $id, Protocol::ACTIVITYPUB, $activity['thread-completion'] ?? 0, $languages);
$wanted = Relay::isSolicitedPost($messageTags, $content, $authorid, $id, Protocol::ACTIVITYPUB, $activity['from-relay'], $languages);
if ($wanted) {
return true;
}
$receivers = [];
foreach (['as:to', 'as:cc', 'as:bto', 'as:bcc', 'as:audience'] as $element) {
$receiver_list = JsonLD::fetchElementArray($activity, $element, '@id');
if (empty($receiver_list)) {
continue;
}
$receivers = array_merge($receivers, $receiver_list);
}
$searchtext = Engagement::getSearchTextForActivity($content, $authorid, $messageTags, $receivers);
$language = array_key_first(Item::getLanguageArray($content, 1, 0, $authorid));
return DI::userDefinedChannel()->match($searchtext, $language);
}
/**

View File

@ -193,12 +193,7 @@ class Relay
}
if (!empty($languages) || !empty($detected)) {
$cachekey = 'relay:isWantedLanguage';
$user_languages = DI::cache()->get($cachekey);
if (is_null($user_languages)) {
$user_languages = User::getLanguages();
DI::cache()->set($cachekey, $user_languages);
}
$user_languages = User::getLanguages();
foreach ($detected as $language) {
if (in_array($language, $user_languages)) {

View File

@ -46,6 +46,7 @@ class OptimizeTables
DBA::optimizeTable('parsed_url');
DBA::optimizeTable('session');
DBA::optimizeTable('post-engagement');
DBA::optimizeTable('check-full-text-search');
if (DI::config()->get('system', 'optimize_all_tables')) {
DBA::optimizeTable('apcontact');

View File

@ -56,7 +56,7 @@ use Friendica\Database\DBA;
// This file is required several times during the test in DbaDefinition which justifies this condition
if (!defined('DB_UPDATE_VERSION')) {
define('DB_UPDATE_VERSION', 1539);
define('DB_UPDATE_VERSION', 1540);
}
return [
@ -1858,6 +1858,17 @@ return [
"uid_application-id" => ["uid", "application-id"],
]
],
"check-full-text-search" => [
"comment" => "Check for a full text search match in user defined channels before storing the message in the system",
"fields" => [
"pid" => ["type" => "int unsigned", "not null" => "1", "primary" => "1", "comment" => "The ID of the process"],
"searchtext" => ["type" => "mediumtext", "comment" => "Simplified text for the full text search"],
],
"indexes" => [
"PRIMARY" => ["pid"],
"searchtext" => ["FULLTEXT", "searchtext"],
],
],
"userd" => [
"comment" => "Deleted usernames",
"fields" => [