2015-09-27 08:02:05 -04:00
< ? php
2017-12-13 02:03:42 -05:00
/**
2022-01-02 02:27:47 -05:00
* @ copyright Copyright ( C ) 2010 - 2022 , the Friendica project
2020-02-09 10:18:46 -05:00
*
* @ license GNU AGPL version 3 or any later version
*
* This program is free software : you can redistribute it and / or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation , either version 3 of the
* License , or ( at your option ) any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU Affero General Public License for more details .
*
* You should have received a copy of the GNU Affero General Public License
* along with this program . If not , see < https :// www . gnu . org / licenses />.
2017-12-13 02:03:42 -05:00
*
*/
2020-02-09 10:18:46 -05:00
2017-12-13 02:03:42 -05:00
namespace Friendica\Protocol ;
2018-07-19 22:15:21 -04:00
use DOMDocument ;
use DOMXPath ;
2020-07-14 10:15:04 -04:00
use Friendica\Content\PageInfo ;
2020-07-17 00:40:20 -04:00
use Friendica\Content\Text\BBCode ;
2018-07-19 22:15:21 -04:00
use Friendica\Content\Text\HTML ;
2021-10-23 04:49:27 -04:00
use Friendica\Core\Cache\Enum\Duration ;
2018-10-29 17:20:46 -04:00
use Friendica\Core\Logger ;
2018-08-11 16:40:44 -04:00
use Friendica\Core\Protocol ;
2018-07-21 08:40:21 -04:00
use Friendica\Database\DBA ;
2019-12-15 18:47:24 -05:00
use Friendica\DI ;
2020-07-17 00:40:20 -04:00
use Friendica\Model\Contact ;
2018-01-20 18:52:54 -05:00
use Friendica\Model\Item ;
2020-10-31 09:26:08 -04:00
use Friendica\Model\Post ;
2020-04-17 02:35:20 -04:00
use Friendica\Model\Tag ;
2020-07-17 00:40:20 -04:00
use Friendica\Model\User ;
use Friendica\Util\DateTimeFormat ;
2018-01-27 08:25:54 -05:00
use Friendica\Util\Network ;
2020-01-03 09:09:03 -05:00
use Friendica\Util\ParseUrl ;
2021-06-30 14:44:41 -04:00
use Friendica\Util\Proxy ;
2020-07-17 00:40:20 -04:00
use Friendica\Util\Strings ;
2018-07-08 09:39:48 -04:00
use Friendica\Util\XML ;
2021-08-29 07:37:08 -04:00
use GuzzleHttp\Exception\TransferException ;
2018-03-08 14:58:35 -05:00
2016-02-16 02:06:55 -05:00
/**
2020-02-09 10:18:46 -05:00
* This class contains functions to import feeds (RSS/RDF/Atom)
2016-02-16 02:06:55 -05:00
*/
2020-06-17 04:54:44 -04:00
class Feed
{
2017-12-13 02:03:42 -05:00
/**
2020-01-19 01:05:23 -05:00
* Read an RSS/RDF/Atom feed and create an item entry for it
2017-12-13 02:03:42 -05:00
*
2019-01-06 16:06:53 -05:00
* @ param string $xml The feed data
* @ param array $importer The user record of the importer
* @ param array $contact The contact record of the feed
2017-12-13 02:03:42 -05:00
*
2020-01-03 09:26:28 -05:00
* @ return array Returns the header and the first item in dry run mode
2019-01-06 16:06:53 -05:00
* @ throws \Friendica\Network\HTTPException\InternalServerErrorException
2017-12-13 02:03:42 -05:00
*/
2022-06-19 18:58:52 -04:00
public static function import ( string $xml , array $importer = [], array $contact = []) : array
2020-01-03 09:09:03 -05:00
{
2020-01-03 09:26:28 -05:00
$dryRun = empty ( $importer ) && empty ( $contact );
2017-12-13 02:03:42 -05:00
2020-01-03 09:26:28 -05:00
if ( $dryRun ) {
Logger :: info ( " Test Atom/RSS feed " );
2017-12-13 02:03:42 -05:00
} else {
2020-01-03 09:26:28 -05:00
Logger :: info ( " Import Atom/RSS feed ' " . $contact [ " name " ] . " ' (Contact " . $contact [ " id " ] . " ) for user " . $importer [ " uid " ]);
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2021-10-03 19:32:20 -04:00
$xml = trim ( $xml );
2018-07-10 08:27:56 -04:00
if ( empty ( $xml )) {
2020-01-03 09:26:28 -05:00
Logger :: info ( 'XML is empty.' );
return [];
2017-12-13 02:03:42 -05:00
}
2015-09-27 08:02:05 -04:00
2017-12-13 02:03:42 -05:00
if ( ! empty ( $contact [ 'poll' ])) {
$basepath = $contact [ 'poll' ];
} elseif ( ! empty ( $contact [ 'url' ])) {
$basepath = $contact [ 'url' ];
} else {
$basepath = '' ;
}
2015-09-27 08:02:05 -04:00
2017-12-13 02:03:42 -05:00
$doc = new DOMDocument ();
2021-10-03 19:32:20 -04:00
@ $doc -> loadXML ( $xml );
2017-12-17 15:24:57 -05:00
$xpath = new DOMXPath ( $doc );
2019-10-24 18:32:35 -04:00
$xpath -> registerNamespace ( 'atom' , ActivityNamespace :: ATOM1 );
2022-06-19 18:58:52 -04:00
$xpath -> registerNamespace ( 'dc' , 'http://purl.org/dc/elements/1.1/' );
$xpath -> registerNamespace ( 'content' , 'http://purl.org/rss/1.0/modules/content/' );
$xpath -> registerNamespace ( 'rdf' , 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' );
$xpath -> registerNamespace ( 'rss' , 'http://purl.org/rss/1.0/' );
$xpath -> registerNamespace ( 'media' , 'http://search.yahoo.com/mrss/' );
2019-10-24 18:32:35 -04:00
$xpath -> registerNamespace ( 'poco' , ActivityNamespace :: POCO );
2015-09-27 08:02:05 -04:00
2018-01-15 08:05:12 -05:00
$author = [];
2018-02-13 23:58:46 -05:00
$entries = null ;
2017-10-16 16:31:13 -04:00
2017-12-13 02:03:42 -05:00
// Is it RDF?
if ( $xpath -> query ( '/rdf:RDF/rss:channel' ) -> length > 0 ) {
2022-06-19 18:58:52 -04:00
$author [ 'author-link' ] = XML :: getFirstNodeValue ( $xpath , '/rdf:RDF/rss:channel/rss:link/text()' );
$author [ 'author-name' ] = XML :: getFirstNodeValue ( $xpath , '/rdf:RDF/rss:channel/rss:title/text()' );
2015-09-27 08:02:05 -04:00
2022-06-19 18:58:52 -04:00
if ( empty ( $author [ 'author-name' ])) {
$author [ 'author-name' ] = XML :: getFirstNodeValue ( $xpath , '/rdf:RDF/rss:channel/rss:description/text()' );
2017-03-15 02:00:22 -04:00
}
2017-12-13 02:03:42 -05:00
$entries = $xpath -> query ( '/rdf:RDF/rss:item' );
2017-03-15 02:00:22 -04:00
}
2016-02-14 09:02:59 -05:00
2017-12-13 02:03:42 -05:00
// Is it Atom?
if ( $xpath -> query ( '/atom:feed' ) -> length > 0 ) {
2018-07-10 08:27:56 -04:00
$alternate = XML :: getFirstAttributes ( $xpath , " atom:link[@rel='alternate'] " );
2017-12-13 02:03:42 -05:00
if ( is_object ( $alternate )) {
2021-10-03 06:34:41 -04:00
foreach ( $alternate as $attribute ) {
2022-06-19 18:58:52 -04:00
if ( $attribute -> name == 'href' ) {
$author [ 'author-link' ] = $attribute -> textContent ;
2017-03-15 02:00:22 -04:00
}
}
}
2016-02-14 09:02:59 -05:00
2022-06-19 18:58:52 -04:00
if ( empty ( $author [ 'author-link' ])) {
2018-07-10 08:27:56 -04:00
$self = XML :: getFirstAttributes ( $xpath , " atom:link[@rel='self'] " );
2017-12-13 02:03:42 -05:00
if ( is_object ( $self )) {
2021-10-03 06:34:41 -04:00
foreach ( $self as $attribute ) {
2022-06-19 18:58:52 -04:00
if ( $attribute -> name == 'href' ) {
$author [ 'author-link' ] = $attribute -> textContent ;
2017-12-13 02:03:42 -05:00
}
}
}
}
2015-09-27 08:02:05 -04:00
2022-06-19 18:58:52 -04:00
if ( empty ( $author [ 'author-link' ])) {
$author [ 'author-link' ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:id/text()' );
2017-12-13 02:03:42 -05:00
}
2022-06-19 18:58:52 -04:00
$author [ 'author-avatar' ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:logo/text()' );
2015-09-27 08:02:05 -04:00
2022-06-19 18:58:52 -04:00
$author [ 'author-name' ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:title/text()' );
2016-07-08 16:31:11 -04:00
2022-06-19 18:58:52 -04:00
if ( empty ( $author [ 'author-name' ])) {
$author [ 'author-name' ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:subtitle/text()' );
2017-03-15 02:00:22 -04:00
}
2020-01-03 09:09:03 -05:00
2022-06-19 18:58:52 -04:00
if ( empty ( $author [ 'author-name' ])) {
$author [ 'author-name' ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:author/atom:name/text()' );
2017-03-15 02:00:22 -04:00
}
2020-01-03 09:09:03 -05:00
2018-07-10 08:27:56 -04:00
$value = XML :: getFirstNodeValue ( $xpath , 'atom:author/poco:displayName/text()' );
2022-06-19 18:58:52 -04:00
if ( $value != '' ) {
$author [ 'author-name' ] = $value ;
2017-03-15 02:00:22 -04:00
}
2020-01-03 09:09:03 -05:00
2020-01-03 09:26:28 -05:00
if ( $dryRun ) {
2022-06-19 18:58:52 -04:00
$author [ 'author-id' ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:author/atom:id/text()' );
2018-11-10 08:24:10 -05:00
// See https://tools.ietf.org/html/rfc4287#section-3.2.2
$value = XML :: getFirstNodeValue ( $xpath , 'atom:author/atom:uri/text()' );
2022-06-19 18:58:52 -04:00
if ( $value != '' ) {
$author [ 'author-link' ] = $value ;
2018-11-10 08:24:10 -05:00
}
2017-12-13 02:03:42 -05:00
2018-07-10 08:27:56 -04:00
$value = XML :: getFirstNodeValue ( $xpath , 'atom:author/poco:preferredUsername/text()' );
2022-06-19 18:58:52 -04:00
if ( $value != '' ) {
$author [ 'author-nick' ] = $value ;
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2018-07-08 07:46:05 -04:00
$value = XML :: getFirstNodeValue ( $xpath , 'atom:author/poco:address/poco:formatted/text()' );
2022-06-19 18:58:52 -04:00
if ( $value != '' ) {
$author [ 'author-location' ] = $value ;
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2018-07-08 07:46:05 -04:00
$value = XML :: getFirstNodeValue ( $xpath , 'atom:author/poco:note/text()' );
2022-06-19 18:58:52 -04:00
if ( $value != '' ) {
$author [ 'author-about' ] = $value ;
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2018-07-10 08:27:56 -04:00
$avatar = XML :: getFirstAttributes ( $xpath , " atom:author/atom:link[@rel='avatar'] " );
2017-12-13 02:03:42 -05:00
if ( is_object ( $avatar )) {
2021-10-03 06:34:41 -04:00
foreach ( $avatar as $attribute ) {
2022-06-19 18:58:52 -04:00
if ( $attribute -> name == 'href' ) {
$author [ 'author-avatar' ] = $attribute -> textContent ;
2017-12-13 02:03:42 -05:00
}
2017-08-21 16:21:04 -04:00
}
}
}
2016-02-14 13:50:59 -05:00
2022-06-19 18:58:52 -04:00
$author [ 'edited' ] = $author [ 'created' ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:updated/text()' );
2015-09-27 08:02:05 -04:00
2022-06-19 18:58:52 -04:00
$author [ 'app' ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:generator/text()' );
2015-09-27 08:02:05 -04:00
2017-12-13 02:03:42 -05:00
$entries = $xpath -> query ( '/atom:feed/atom:entry' );
}
2015-09-27 08:02:05 -04:00
2017-12-13 02:03:42 -05:00
// Is it RSS?
if ( $xpath -> query ( '/rss/channel' ) -> length > 0 ) {
2022-06-19 18:58:52 -04:00
$author [ 'author-link' ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/link/text()' );
2016-02-14 09:02:59 -05:00
2022-06-19 18:58:52 -04:00
$author [ 'author-name' ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/title/text()' );
2015-09-27 08:02:05 -04:00
2022-06-19 18:58:52 -04:00
if ( empty ( $author [ 'author-name' ])) {
$author [ 'author-name' ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/copyright/text()' );
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2022-06-19 18:58:52 -04:00
if ( empty ( $author [ 'author-name' ])) {
$author [ 'author-name' ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/description/text()' );
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2022-06-19 18:58:52 -04:00
$author [ 'author-avatar' ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/image/url/text()' );
2022-03-13 11:29:07 -04:00
2022-06-19 18:58:52 -04:00
if ( empty ( $author [ 'author-avatar' ])) {
$avatar = XML :: getFirstAttributes ( $xpath , '/rss/channel/itunes:image' );
2022-03-13 11:29:07 -04:00
if ( is_object ( $avatar )) {
foreach ( $avatar as $attribute ) {
2022-06-19 18:58:52 -04:00
if ( $attribute -> name == 'href' ) {
$author [ 'author-avatar' ] = $attribute -> textContent ;
2022-03-13 11:29:07 -04:00
}
}
}
}
2022-06-19 18:58:52 -04:00
$author [ 'author-about' ] = HTML :: toBBCode ( XML :: getFirstNodeValue ( $xpath , '/rss/channel/description/text()' ), $basepath );
2022-03-13 11:29:07 -04:00
2022-06-19 18:58:52 -04:00
if ( empty ( $author [ 'author-about' ])) {
$author [ 'author-about' ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/itunes:summary/text()' );
2022-03-13 11:29:07 -04:00
}
2022-06-19 18:58:52 -04:00
$author [ 'edited' ] = $author [ 'created' ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/pubDate/text()' );
2015-09-27 08:02:05 -04:00
2022-06-19 18:58:52 -04:00
$author [ 'app' ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/generator/text()' );
2015-09-27 08:02:05 -04:00
2017-12-13 02:03:42 -05:00
$entries = $xpath -> query ( '/rss/channel/item' );
}
2020-01-03 09:26:28 -05:00
if ( ! $dryRun ) {
2022-06-19 18:58:52 -04:00
$author [ 'author-link' ] = $contact [ 'url' ];
2015-09-27 08:02:05 -04:00
2022-06-19 18:58:52 -04:00
if ( empty ( $author [ 'author-name' ])) {
$author [ 'author-name' ] = $contact [ 'name' ];
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2022-06-19 18:58:52 -04:00
$author [ 'author-avatar' ] = $contact [ 'thumb' ];
2015-09-27 08:02:05 -04:00
2022-06-19 18:58:52 -04:00
$author [ 'owner-link' ] = $contact [ 'url' ];
$author [ 'owner-name' ] = $contact [ 'name' ];
$author [ 'owner-avatar' ] = $contact [ 'thumb' ];
2017-03-15 02:00:22 -04:00
}
2015-09-27 08:02:05 -04:00
2018-01-15 08:05:12 -05:00
$header = [];
2022-06-19 18:58:52 -04:00
$header [ 'uid' ] = $importer [ 'uid' ] ? ? 0 ;
$header [ 'network' ] = Protocol :: FEED ;
$header [ 'wall' ] = 0 ;
$header [ 'origin' ] = 0 ;
$header [ 'gravity' ] = GRAVITY_PARENT ;
$header [ 'private' ] = Item :: PUBLIC ;
$header [ 'verb' ] = Activity :: POST ;
$header [ 'object-type' ] = Activity\ObjectType :: NOTE ;
$header [ 'post-type' ] = Item :: PT_ARTICLE ;
$header [ 'contact-id' ] = $contact [ 'id' ] ? ? 0 ;
2015-10-03 07:58:10 -04:00
2017-12-13 02:03:42 -05:00
if ( ! is_object ( $entries )) {
2020-01-03 09:26:28 -05:00
Logger :: info ( " There are no entries in this feed. " );
return [];
2017-12-13 02:03:42 -05:00
}
2016-02-14 09:02:59 -05:00
2018-01-15 08:05:12 -05:00
$items = [];
2020-08-16 13:59:37 -04:00
$creation_dates = [];
2020-03-22 09:05:35 -04:00
// Limit the number of items that are about to be fetched
$total_items = ( $entries -> length - 1 );
$max_items = DI :: config () -> get ( 'system' , 'max_feed_items' );
if (( $max_items > 0 ) && ( $total_items > $max_items )) {
$total_items = $max_items ;
}
2020-11-30 00:39:12 -05:00
$postings = [];
2018-03-10 12:40:21 -05:00
// Importing older entries first
2020-03-22 09:05:35 -04:00
for ( $i = $total_items ; $i >= 0 ; -- $i ) {
2018-03-10 12:40:21 -05:00
$entry = $entries -> item ( $i );
2015-10-14 02:10:06 -04:00
2017-12-13 02:03:42 -05:00
$item = array_merge ( $header , $author );
2018-07-10 08:27:56 -04:00
$alternate = XML :: getFirstAttributes ( $xpath , " atom:link[@rel='alternate'] " , $entry );
2017-12-13 02:03:42 -05:00
if ( ! is_object ( $alternate )) {
2022-06-19 18:58:52 -04:00
$alternate = XML :: getFirstAttributes ( $xpath , 'atom:link' , $entry );
2017-12-13 02:03:42 -05:00
}
if ( is_object ( $alternate )) {
2021-10-03 06:34:41 -04:00
foreach ( $alternate as $attribute ) {
2022-06-19 18:58:52 -04:00
if ( $attribute -> name == 'href' ) {
$item [ 'plink' ] = $attribute -> textContent ;
2017-12-13 02:03:42 -05:00
}
2017-03-15 02:00:22 -04:00
}
}
2020-01-03 09:09:03 -05:00
2022-06-19 18:58:52 -04:00
if ( empty ( $item [ 'plink' ])) {
$item [ 'plink' ] = XML :: getFirstNodeValue ( $xpath , 'link/text()' , $entry );
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2022-06-19 18:58:52 -04:00
if ( empty ( $item [ 'plink' ])) {
$item [ 'plink' ] = XML :: getFirstNodeValue ( $xpath , 'rss:link/text()' , $entry );
2017-12-13 02:03:42 -05:00
}
2015-09-27 08:02:05 -04:00
2022-03-13 11:29:07 -04:00
// Add the base path if missing
2022-06-19 18:58:52 -04:00
$item [ 'plink' ] = Network :: addBasePath ( $item [ 'plink' ], $basepath );
2022-03-13 11:29:07 -04:00
2022-06-19 18:58:52 -04:00
$item [ 'uri' ] = XML :: getFirstNodeValue ( $xpath , 'atom:id/text()' , $entry );
2015-09-27 08:02:05 -04:00
2022-03-13 11:29:07 -04:00
$guid = XML :: getFirstNodeValue ( $xpath , 'guid/text()' , $entry );
if ( ! empty ( $guid )) {
2022-06-19 18:58:52 -04:00
$item [ 'uri' ] = $guid ;
2022-03-14 17:09:14 -04:00
// Don't use the GUID value directly but instead use it as a basis for the GUID
2022-06-20 14:01:25 -04:00
$item [ 'guid' ] = Item :: guidFromUri ( $guid , parse_url ( $guid , PHP_URL_HOST ) ? ? parse_url ( $item [ 'plink' ], PHP_URL_HOST ) ? ? '' );
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2022-06-19 18:58:52 -04:00
if ( empty ( $item [ 'uri' ])) {
$item [ 'uri' ] = $item [ 'plink' ];
2017-12-13 02:03:42 -05:00
}
2017-04-08 04:12:14 -04:00
2022-06-19 18:58:52 -04:00
$orig_plink = $item [ 'plink' ];
2017-04-08 04:12:14 -04:00
2021-08-29 07:37:08 -04:00
try {
2022-06-19 18:58:52 -04:00
$item [ 'plink' ] = DI :: httpClient () -> finalUrl ( $item [ 'plink' ]);
2021-08-29 07:37:08 -04:00
} catch ( TransferException $exception ) {
2022-06-19 18:58:52 -04:00
Logger :: notice ( 'Item URL couldn\'t get expanded' , [ 'url' => $item [ 'plink' ], 'exception' => $exception ]);
2021-08-29 07:37:08 -04:00
}
2017-04-08 04:12:14 -04:00
2022-06-19 18:58:52 -04:00
$item [ 'title' ] = XML :: getFirstNodeValue ( $xpath , 'atom:title/text()' , $entry );
2017-01-31 14:39:09 -05:00
2022-06-19 18:58:52 -04:00
if ( empty ( $item [ 'title' ])) {
$item [ 'title' ] = XML :: getFirstNodeValue ( $xpath , 'title/text()' , $entry );
2017-12-13 02:03:42 -05:00
}
2022-03-13 11:29:07 -04:00
2022-06-19 18:58:52 -04:00
if ( empty ( $item [ 'title' ])) {
$item [ 'title' ] = XML :: getFirstNodeValue ( $xpath , 'rss:title/text()' , $entry );
2017-12-13 02:03:42 -05:00
}
2019-08-18 09:37:31 -04:00
2022-06-19 18:58:52 -04:00
if ( empty ( $item [ 'title' ])) {
$item [ 'title' ] = XML :: getFirstNodeValue ( $xpath , 'itunes:title/text()' , $entry );
2022-03-13 11:29:07 -04:00
}
2022-06-19 18:58:52 -04:00
$item [ 'title' ] = html_entity_decode ( $item [ 'title' ], ENT_QUOTES , 'UTF-8' );
2019-08-18 09:37:31 -04:00
2018-07-08 07:46:05 -04:00
$published = XML :: getFirstNodeValue ( $xpath , 'atom:published/text()' , $entry );
2015-09-27 08:02:05 -04:00
2018-07-10 08:27:56 -04:00
if ( empty ( $published )) {
2018-07-08 07:46:05 -04:00
$published = XML :: getFirstNodeValue ( $xpath , 'pubDate/text()' , $entry );
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2018-07-10 08:27:56 -04:00
if ( empty ( $published )) {
2018-07-08 07:46:05 -04:00
$published = XML :: getFirstNodeValue ( $xpath , 'dc:date/text()' , $entry );
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2018-07-08 07:46:05 -04:00
$updated = XML :: getFirstNodeValue ( $xpath , 'atom:updated/text()' , $entry );
2015-09-27 08:02:05 -04:00
2018-08-29 15:11:43 -04:00
if ( empty ( $updated ) && ! empty ( $published )) {
2017-12-13 02:03:42 -05:00
$updated = $published ;
}
2018-08-29 15:11:43 -04:00
if ( empty ( $published ) && ! empty ( $updated )) {
$published = $updated ;
}
2022-06-19 18:58:52 -04:00
if ( $published != '' ) {
$item [ 'created' ] = $published ;
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2022-06-19 18:58:52 -04:00
if ( $updated != '' ) {
$item [ 'edited' ] = $updated ;
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2020-08-16 13:59:37 -04:00
if ( ! $dryRun ) {
$condition = [ " `uid` = ? AND `uri` = ? AND `network` IN (?, ?) " ,
2022-06-19 18:58:52 -04:00
$importer [ 'uid' ], $item [ 'uri' ], Protocol :: FEED , Protocol :: DFRN ];
2021-01-15 23:14:58 -05:00
$previous = Post :: selectFirst ([ 'id' , 'created' ], $condition );
2020-08-16 13:59:37 -04:00
if ( DBA :: isResult ( $previous )) {
// Use the creation date from when the post was originally stored, since this date can change in the feed.
$creation_dates [] = $previous [ 'created' ];
Logger :: info ( " Item with uri " . $item [ " uri " ] . " for user " . $importer [ " uid " ] . " already existed under id " . $previous [ " id " ]);
continue ;
}
$creation_dates [] = DateTimeFormat :: utc ( $item [ 'created' ]);
}
2018-07-08 07:46:05 -04:00
$creator = XML :: getFirstNodeValue ( $xpath , 'author/text()' , $entry );
2015-09-27 08:02:05 -04:00
2018-07-10 08:27:56 -04:00
if ( empty ( $creator )) {
2018-07-08 07:46:05 -04:00
$creator = XML :: getFirstNodeValue ( $xpath , 'atom:author/atom:name/text()' , $entry );
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2018-07-10 08:27:56 -04:00
if ( empty ( $creator )) {
2018-07-08 07:46:05 -04:00
$creator = XML :: getFirstNodeValue ( $xpath , 'dc:creator/text()' , $entry );
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2022-06-19 18:58:52 -04:00
if ( $creator != '' ) {
$item [ 'author-name' ] = $creator ;
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2018-07-08 07:46:05 -04:00
$creator = XML :: getFirstNodeValue ( $xpath , 'dc:creator/text()' , $entry );
2015-09-27 08:02:05 -04:00
2022-06-19 18:58:52 -04:00
if ( $creator != '' ) {
$item [ 'author-name' ] = $creator ;
2017-12-13 02:03:42 -05:00
}
2017-04-04 13:47:45 -04:00
2017-12-13 02:03:42 -05:00
/// @TODO ?
// <category>Ausland</category>
// <media:thumbnail width="152" height="76" url="http://www.taz.de/picture/667875/192/14388767.jpg"/>
2018-01-15 08:05:12 -05:00
$attachments = [];
2017-12-13 02:03:42 -05:00
2018-03-10 12:40:21 -05:00
$enclosures = $xpath -> query ( " enclosure|atom:link[@rel='enclosure'] " , $entry );
2021-10-03 06:34:41 -04:00
foreach ( $enclosures as $enclosure ) {
2022-06-19 18:58:52 -04:00
$href = '' ;
2020-11-07 03:22:59 -05:00
$length = null ;
$type = null ;
2017-12-13 02:03:42 -05:00
2021-10-03 06:34:41 -04:00
foreach ( $enclosure -> attributes as $attribute ) {
2022-06-19 18:58:52 -04:00
if ( in_array ( $attribute -> name , [ 'url' , 'href' ])) {
2018-03-10 18:35:24 -05:00
$href = $attribute -> textContent ;
2022-06-19 18:58:52 -04:00
} elseif ( $attribute -> name == 'length' ) {
2020-11-04 01:58:04 -05:00
$length = ( int ) $attribute -> textContent ;
2022-06-19 18:58:52 -04:00
} elseif ( $attribute -> name == 'type' ) {
2018-03-10 18:35:24 -05:00
$type = $attribute -> textContent ;
2017-12-13 02:03:42 -05:00
}
}
2020-01-03 09:09:03 -05:00
2020-11-07 03:22:59 -05:00
if ( ! empty ( $href )) {
2021-12-08 08:32:20 -05:00
$attachment = [ 'type' => Post\Media :: UNKNOWN , 'url' => $href , 'mimetype' => $type , 'size' => $length ];
$attachment = Post\Media :: fetchAdditionalData ( $attachment );
// For now we separate the visible media types (audio, video, image) from the rest
// In the future we should try to avoid the DOCUMENT type and only use the real one - but not in the RC phase.
if ( ! in_array ( $attachment [ 'type' ], [ Post\Media :: AUDIO , Post\Media :: IMAGE , Post\Media :: VIDEO ])) {
$attachment [ 'type' ] = Post\Media :: DOCUMENT ;
}
$attachments [] = $attachment ;
2017-03-15 02:00:22 -04:00
}
2017-12-13 02:03:42 -05:00
}
2015-09-27 08:02:05 -04:00
2020-04-15 01:10:40 -04:00
$taglist = [];
2022-06-19 18:58:52 -04:00
$categories = $xpath -> query ( 'category' , $entry );
2021-10-03 06:34:41 -04:00
foreach ( $categories as $category ) {
2020-05-02 01:08:05 -04:00
$taglist [] = $category -> nodeValue ;
2017-12-13 02:03:42 -05:00
}
2017-12-12 00:35:41 -05:00
2018-07-10 08:27:56 -04:00
$body = trim ( XML :: getFirstNodeValue ( $xpath , 'atom:content/text()' , $entry ));
2017-10-17 05:10:19 -04:00
2018-07-10 08:27:56 -04:00
if ( empty ( $body )) {
$body = trim ( XML :: getFirstNodeValue ( $xpath , 'content:encoded/text()' , $entry ));
2017-12-13 02:03:42 -05:00
}
2019-03-16 06:59:11 -04:00
$summary = trim ( XML :: getFirstNodeValue ( $xpath , 'atom:summary/text()' , $entry ));
if ( empty ( $summary )) {
$summary = trim ( XML :: getFirstNodeValue ( $xpath , 'description/text()' , $entry ));
2017-12-13 02:03:42 -05:00
}
2019-03-16 06:59:11 -04:00
2018-07-10 08:27:56 -04:00
if ( empty ( $body )) {
2019-03-16 06:59:11 -04:00
$body = $summary ;
$summary = '' ;
}
if ( $body == $summary ) {
$summary = '' ;
2017-12-13 02:03:42 -05:00
}
2017-10-17 05:10:19 -04:00
2017-12-13 02:03:42 -05:00
// Remove the content of the title if it is identical to the body
// This helps with auto generated titles e.g. from tumblr
2022-06-19 18:58:52 -04:00
if ( self :: titleIsBody ( $item [ 'title' ], $body )) {
$item [ 'title' ] = '' ;
2017-12-13 02:03:42 -05:00
}
2022-06-19 18:58:52 -04:00
$item [ 'body' ] = HTML :: toBBCode ( $body , $basepath );
2017-10-17 05:10:19 -04:00
2020-09-18 11:25:48 -04:00
// Remove tracking pixels
2022-06-19 18:58:52 -04:00
$item [ 'body' ] = preg_replace ( " / \ [img=1x1 \ ]([^ \ [ \ ]]*) \ [ \ /img \ ]/Usi " , '' , $item [ 'body' ]);
2020-09-18 11:25:48 -04:00
2022-06-19 18:58:52 -04:00
if (( $item [ 'body' ] == '' ) && ( $item [ 'title' ] != '' )) {
$item [ 'body' ] = $item [ 'title' ];
$item [ 'title' ] = '' ;
2017-12-13 02:03:42 -05:00
}
2017-10-17 05:10:19 -04:00
2020-08-15 16:05:08 -04:00
if ( $dryRun ) {
2022-03-13 11:29:07 -04:00
$item [ 'attachments' ] = $attachments ;
2020-08-15 16:05:08 -04:00
$items [] = $item ;
break ;
} elseif ( ! Item :: isValid ( $item )) {
2020-11-30 15:32:56 -05:00
Logger :: info ( 'Feed item is invalid' , [ 'created' => $item [ 'created' ], 'uid' => $item [ 'uid' ], 'uri' => $item [ 'uri' ]]);
continue ;
2020-11-30 15:59:18 -05:00
} elseif ( Item :: isTooOld ( $item )) {
2020-11-30 15:32:56 -05:00
Logger :: info ( 'Feed is too old' , [ 'created' => $item [ 'created' ], 'uid' => $item [ 'uid' ], 'uri' => $item [ 'uri' ]]);
2020-08-15 16:05:08 -04:00
continue ;
}
2018-02-13 23:58:46 -05:00
$preview = '' ;
2022-06-19 18:58:52 -04:00
if ( ! empty ( $contact [ 'fetch_further_information' ]) && ( $contact [ 'fetch_further_information' ] < 3 )) {
2017-12-13 02:03:42 -05:00
// Handle enclosures and treat them as preview picture
2021-10-03 06:34:41 -04:00
foreach ( $attachments as $attachment ) {
2022-06-19 18:58:52 -04:00
if ( $attachment [ 'mimetype' ] == 'image/jpeg' ) {
$preview = $attachment [ 'url' ];
2017-12-13 02:03:42 -05:00
}
2017-03-15 02:00:22 -04:00
}
2015-09-27 08:02:05 -04:00
2017-12-13 02:03:42 -05:00
// Remove a possible link to the item itself
2022-06-19 18:58:52 -04:00
$item [ 'body' ] = str_replace ( $item [ 'plink' ], '' , $item [ 'body' ]);
$item [ 'body' ] = trim ( preg_replace ( '/\[url\=\](\w+.*?)\[\/url\]/i' , '' , $item [ 'body' ]));
2017-10-17 07:39:09 -04:00
2017-12-13 02:03:42 -05:00
// Replace the content when the title is longer than the body
2022-06-19 18:58:52 -04:00
$replace = ( strlen ( $item [ 'title' ]) > strlen ( $item [ 'body' ]));
2017-10-17 05:58:29 -04:00
2017-12-13 02:03:42 -05:00
// Replace it, when there is an image in the body
2022-06-19 18:58:52 -04:00
if ( strstr ( $item [ 'body' ], '[/img]' )) {
2017-12-13 02:03:42 -05:00
$replace = true ;
}
2017-10-17 05:58:29 -04:00
2017-12-13 02:03:42 -05:00
// Replace it, when there is a link in the body
2022-06-19 18:58:52 -04:00
if ( strstr ( $item [ 'body' ], '[/url]' )) {
2017-12-13 02:03:42 -05:00
$replace = true ;
}
2017-10-17 05:58:29 -04:00
2022-06-19 18:58:52 -04:00
$saved_body = $item [ 'body' ];
$saved_title = $item [ 'title' ];
2020-09-17 06:36:33 -04:00
2017-12-13 02:03:42 -05:00
if ( $replace ) {
2022-06-19 18:58:52 -04:00
$item [ 'body' ] = trim ( $item [ 'title' ]);
2017-12-13 02:03:42 -05:00
}
2019-11-18 07:29:27 -05:00
2021-03-16 03:15:20 -04:00
$data = ParseUrl :: getSiteinfoCached ( $item [ 'plink' ]);
2019-11-18 07:29:27 -05:00
if ( ! empty ( $data [ 'text' ]) && ! empty ( $data [ 'title' ]) && ( mb_strlen ( $item [ 'body' ]) < mb_strlen ( $data [ 'text' ]))) {
// When the fetched page info text is longer than the body, we do try to enhance the body
2019-11-18 13:09:21 -05:00
if ( ! empty ( $item [ 'body' ]) && ( strpos ( $data [ 'title' ], $item [ 'body' ]) === false ) && ( strpos ( $data [ 'text' ], $item [ 'body' ]) === false )) {
2019-11-18 07:29:27 -05:00
// The body is not part of the fetched page info title or page info text. So we add the text to the body
$item [ 'body' ] .= " \n \n " . $data [ 'text' ];
} else {
// Else we replace the body with the page info text
$item [ 'body' ] = $data [ 'text' ];
}
}
2022-06-19 18:58:52 -04:00
$data = PageInfo :: queryUrl ( $item [ 'plink' ], false , $preview , ( $contact [ 'fetch_further_information' ] == 2 ), $contact [ 'ffi_keyword_denylist' ] ? ? '' );
2020-09-17 06:36:33 -04:00
2020-11-12 00:17:48 -05:00
if ( ! empty ( $data )) {
// Take the data that was provided by the feed if the query is empty
if (( $data [ 'type' ] == 'link' ) && empty ( $data [ 'title' ]) && empty ( $data [ 'text' ])) {
$data [ 'title' ] = $saved_title ;
2022-06-19 18:58:52 -04:00
$item [ 'body' ] = $saved_body ;
2020-11-12 00:17:48 -05:00
}
2020-09-17 06:36:33 -04:00
2020-11-12 00:17:48 -05:00
$data_text = strip_tags ( trim ( $data [ 'text' ] ? ? '' ));
$item_body = strip_tags ( trim ( $item [ 'body' ] ? ? '' ));
2020-09-17 09:07:20 -04:00
2020-11-12 00:17:48 -05:00
if ( ! empty ( $data_text ) && (( $data_text == $item_body ) || strstr ( $item_body , $data_text ))) {
$data [ 'text' ] = '' ;
}
2020-09-17 09:07:20 -04:00
2020-11-12 00:17:48 -05:00
// We always strip the title since it will be added in the page information
2022-06-19 18:58:52 -04:00
$item [ 'title' ] = '' ;
$item [ 'body' ] = $item [ 'body' ] . " \n " . PageInfo :: getFooterFromData ( $data , false );
$taglist = $contact [ 'fetch_further_information' ] == 2 ? PageInfo :: getTagsFromUrl ( $item [ 'plink' ], $preview , $contact [ 'ffi_keyword_denylist' ] ? ? '' ) : [];
$item [ 'object-type' ] = Activity\ObjectType :: BOOKMARK ;
2020-11-12 00:17:48 -05:00
$attachments = [];
2021-12-03 23:03:18 -05:00
foreach ([ 'audio' , 'video' ] as $elementname ) {
if ( ! empty ( $data [ $elementname ])) {
foreach ( $data [ $elementname ] as $element ) {
if ( ! empty ( $element [ 'src' ])) {
$src = $element [ 'src' ];
} elseif ( ! empty ( $element [ 'content' ])) {
$src = $element [ 'content' ];
} else {
continue ;
}
$attachments [] = [
'type' => ( $elementname == 'audio' ) ? Post\Media :: AUDIO : Post\Media :: VIDEO ,
'url' => $src ,
'preview' => $element [ 'image' ] ? ? null ,
'mimetype' => $element [ 'contenttype' ] ? ? null ,
'name' => $element [ 'name' ] ? ? null ,
'description' => $element [ 'description' ] ? ? null ,
];
}
}
}
2020-11-12 00:17:48 -05:00
}
2017-12-13 02:03:42 -05:00
} else {
2019-03-16 06:59:11 -04:00
if ( ! empty ( $summary )) {
2022-06-19 18:58:52 -04:00
$item [ 'body' ] = '[abstract]' . HTML :: toBBCode ( $summary , $basepath ) . " [/abstract] \n " . $item [ 'body' ];
2019-03-16 06:59:11 -04:00
}
2022-06-19 18:58:52 -04:00
if ( ! empty ( $contact [ 'fetch_further_information' ]) && ( $contact [ 'fetch_further_information' ] == 3 )) {
2020-05-02 01:08:05 -04:00
if ( empty ( $taglist )) {
2022-06-19 18:58:52 -04:00
$taglist = PageInfo :: getTagsFromUrl ( $item [ 'plink' ], $preview , $contact [ 'ffi_keyword_denylist' ] ? ? '' );
2017-12-13 02:03:42 -05:00
}
2022-06-19 18:58:52 -04:00
$item [ 'body' ] .= " \n " . self :: tagToString ( $taglist );
2020-04-15 01:10:40 -04:00
} else {
$taglist = [];
2017-12-13 02:03:42 -05:00
}
2020-01-03 09:09:03 -05:00
2018-01-12 00:55:14 -05:00
// Add the link to the original feed entry if not present in feed
2022-06-19 18:58:52 -04:00
if (( $item [ 'plink' ] != '' ) && ! strstr ( $item [ 'body' ], $item [ 'plink' ]) && ! in_array ( $item [ 'plink' ], array_column ( $attachments , 'url' ))) {
$item [ 'body' ] .= '[hr][url]' . $item [ 'plink' ] . '[/url]' ;
2017-12-12 00:35:41 -05:00
}
2017-08-27 02:59:07 -04:00
}
2015-09-27 08:02:05 -04:00
2021-04-07 02:02:06 -04:00
if ( empty ( $item [ 'title' ])) {
$item [ 'post-type' ] = Item :: PT_NOTE ;
}
2020-08-15 16:05:08 -04:00
Logger :: info ( 'Stored feed' , [ 'item' => $item ]);
2015-09-27 08:02:05 -04:00
2020-08-15 16:05:08 -04:00
$notify = Item :: isRemoteSelf ( $contact , $item );
2016-11-14 01:55:17 -05:00
2020-08-15 16:05:08 -04:00
// Distributed items should have a well formatted URI.
// Additionally we have to avoid conflicts with identical URI between imported feeds and these items.
if ( $notify ) {
$item [ 'guid' ] = Item :: guidFromUri ( $orig_plink , DI :: baseUrl () -> getHostname ());
2020-12-01 17:11:29 -05:00
$item [ 'uri' ] = Item :: newURI ( $item [ 'uid' ], $item [ 'guid' ]);
2020-11-11 02:47:48 -05:00
unset ( $item [ 'thr-parent' ]);
2020-08-15 16:05:08 -04:00
unset ( $item [ 'parent-uri' ]);
2018-05-15 15:29:14 -04:00
2020-08-15 16:05:08 -04:00
// Set the delivery priority for "remote self" to "medium"
$notify = PRIORITY_MEDIUM ;
}
2020-04-15 07:39:00 -04:00
2020-12-01 19:29:57 -05:00
$condition = [ 'uid' => $item [ 'uid' ], 'uri' => $item [ 'uri' ]];
2022-06-19 18:58:52 -04:00
if ( ! Post :: exists ( $condition ) && ! Post\Delayed :: exists ( $item [ 'uri' ], $item [ 'uid' ])) {
2020-12-02 00:41:33 -05:00
if ( ! $notify ) {
Post\Delayed :: publish ( $item , $notify , $taglist , $attachments );
} else {
$postings [] = [ 'item' => $item , 'notify' => $notify ,
'taglist' => $taglist , 'attachments' => $attachments ];
}
2020-12-01 17:11:29 -05:00
} else {
2022-06-19 18:58:52 -04:00
Logger :: info ( 'Post already created or exists in the delayed posts queue' , [ 'uid' => $item [ 'uid' ], 'uri' => $item [ 'uri' ]]);
2020-12-01 17:11:29 -05:00
}
2020-11-30 00:39:12 -05:00
}
2015-09-27 08:02:05 -04:00
2020-11-30 00:39:12 -05:00
if ( ! empty ( $postings )) {
2020-12-01 17:11:29 -05:00
$min_posting = DI :: config () -> get ( 'system' , 'minimum_posting_interval' , 0 );
2020-11-30 00:39:12 -05:00
$total = count ( $postings );
if ( $total > 1 ) {
2020-11-30 01:19:10 -05:00
// Posts shouldn't be delayed more than a day
2020-11-30 03:59:29 -05:00
$interval = min ( 1440 , self :: getPollInterval ( $contact ));
2020-12-01 17:11:29 -05:00
$delay = max ( round (( $interval * 60 ) / $total ), 60 * $min_posting );
2020-12-02 03:36:23 -05:00
Logger :: info ( 'Got posting delay' , [ 'delay' => $delay , 'interval' => $interval , 'items' => $total , 'cid' => $contact [ 'id' ], 'url' => $contact [ 'url' ]]);
2020-11-30 00:39:12 -05:00
} else {
$delay = 0 ;
}
$post_delay = 0 ;
2020-04-15 01:10:40 -04:00
2020-11-30 00:39:12 -05:00
foreach ( $postings as $posting ) {
2020-12-02 00:08:39 -05:00
if ( $delay > 0 ) {
$publish_time = time () + $post_delay ;
$post_delay += $delay ;
} else {
$publish_time = time ();
}
2020-12-01 23:53:54 -05:00
2020-12-02 00:08:39 -05:00
$last_publish = DI :: pConfig () -> get ( $posting [ 'item' ][ 'uid' ], 'system' , 'last_publish' , 0 , true );
$next_publish = max ( $last_publish + ( 60 * $min_posting ), time ());
if ( $publish_time < $next_publish ) {
$publish_time = $next_publish ;
2020-04-15 01:10:40 -04:00
}
2020-12-01 17:11:29 -05:00
$publish_at = date ( DateTimeFormat :: MYSQL , $publish_time );
2020-11-30 00:39:12 -05:00
2021-08-06 14:49:17 -04:00
if ( Post\Delayed :: add ( $posting [ 'item' ][ 'uri' ], $posting [ 'item' ], $posting [ 'notify' ], Post\Delayed :: PREPARED , $publish_at , $posting [ 'taglist' ], $posting [ 'attachments' ])) {
2021-08-03 01:39:04 -04:00
DI :: pConfig () -> set ( $item [ 'uid' ], 'system' , 'last_publish' , $publish_time );
}
2017-12-13 02:03:42 -05:00
}
2017-03-15 02:00:22 -04:00
}
2017-12-13 02:03:42 -05:00
2020-08-16 17:38:26 -04:00
if ( ! $dryRun && DI :: config () -> get ( 'system' , 'adjust_poll_frequency' )) {
2020-08-16 13:59:37 -04:00
self :: adjustPollFrequency ( $contact , $creation_dates );
}
2022-06-19 18:58:52 -04:00
return [ 'header' => $author , 'items' => $items ];
2015-09-27 08:02:05 -04:00
}
2018-01-20 18:52:54 -05:00
2020-08-16 13:59:37 -04:00
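/**
 * Automatically adjust the poll frequency of a feed contact according to its post frequency
 *
 * @param array $contact        Contact record; `id`, `uid`, `url`, `network` and `rating` are read
 * @param array $creation_dates Creation dates of the feed entries (anything strtotime() understands)
 * @return void
 */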
private static function adjustPollFrequency(array $contact, array $creation_dates)
{
    if ($contact['network'] != Protocol::FEED) {
        Logger::info('Contact is no feed, skip.', ['id' => $contact['id'], 'uid' => $contact['uid'], 'url' => $contact['url'], 'network' => $contact['network']]);
        return;
    }

    // Contact identification that is attached to every log entry below
    $context = ['id' => $contact['id'], 'uid' => $contact['uid'], 'url' => $contact['url']];

    if (empty($creation_dates)) {
        Logger::info('No posts, switching to daily polling', $context);
        $priority = 8; // Poll once a day
    } else {
        // Current time expressed in (fractional) days since the Unix epoch
        $today = time() / 86400;

        // Per day: number of posts plus the earliest and latest second of the day with a post
        $frequency   = [];
        $newest      = 0;
        $newest_date = '';

        foreach ($creation_dates as $date) {
            $timestamp = strtotime($date);
            if ($timestamp === false) {
                // An unparseable date carries no usable information
                continue;
            }

            $day           = intdiv($timestamp, 86400);
            $second_of_day = $timestamp % 86400;

            // Only have a look at values from the last seven days
            if (($today - $day) < 7) {
                if (empty($frequency[$day])) {
                    $frequency[$day] = ['count' => 1, 'low' => $second_of_day, 'high' => $second_of_day];
                } else {
                    ++$frequency[$day]['count'];
                    $frequency[$day]['low']  = min($frequency[$day]['low'], $second_of_day);
                    $frequency[$day]['high'] = max($frequency[$day]['high'], $second_of_day);
                }
            }

            if ($newest < $day) {
                $newest      = $day;
                $newest_date = $date;
            }
        }

        // Days that have passed since the newest post
        $age = $today - $newest;

        if (count($creation_dates) == 1) {
            Logger::info('Feed had posted a single time, switching to daily polling', ['newest' => $newest_date] + $context);
            $priority = 8; // Poll once a day
        } elseif ($age > 730) {
            Logger::info('Feed had not posted for two years, switching to monthly polling', ['newest' => $newest_date] + $context);
            $priority = 10; // Poll every month
        } elseif ($age > 365) {
            Logger::info('Feed had not posted for a year, switching to weekly polling', ['newest' => $newest_date] + $context);
            $priority = 9; // Poll every week
        } elseif (empty($frequency)) {
            Logger::info('Feed had not posted for at least a week, switching to daily polling', ['newest' => $newest_date] + $context);
            $priority = 8; // Poll once a day
        } else {
            // Calculate the highest "posts per day" value
            $max = 0;
            foreach ($frequency as $entry) {
                if (($entry['count'] == 1) || ($entry['high'] == $entry['low'])) {
                    continue;
                }

                // Take the time span between the earliest and latest post of the day and
                // extrapolate the number of posts a full day would have with that frequency.
                // Assume at least four hours between both posts - should be okay for news outlets.
                $duration = max($entry['high'] - $entry['low'], 14400);
                $ppd      = (86400 / $duration) * $entry['count'];
                $max      = max($max, $ppd);
            }

            if ($max > 48) {
                $priority = 1; // Poll every quarter hour
            } elseif ($max > 24) {
                $priority = 2; // Poll every half hour
            } elseif ($max > 12) {
                $priority = 3; // Poll hourly
            } elseif ($max > 8) {
                $priority = 4; // Poll every two hours
            } elseif ($max > 4) {
                $priority = 5; // Poll every three hours
            } elseif ($max > 2) {
                $priority = 6; // Poll every six hours
            } else {
                $priority = 7; // Poll twice a day
            }

            Logger::info('Calculated priority by the posts per day', ['priority' => $priority, 'max' => round($max, 2)] + $context);
        }
    }

    // Only write to the contact when the rating really changes
    if ($contact['rating'] != $priority) {
        Logger::notice('Adjusting priority', ['old' => $contact['rating'], 'new' => $priority] + $context);
        Contact::update(['rating' => $priority], ['id' => $contact['id']]);
    }
}
2020-11-30 01:19:10 -05:00
/**
 * Get the poll interval for the given contact array
 *
 * @param array $contact Contact record; `network`, `rating`, `priority` and `archive` are read
 * @return int Poll interval in minutes
 */
public static function getPollInterval(array $contact): int
{
    if (in_array($contact['network'], [Protocol::MAIL, Protocol::FEED])) {
        // Translates the contact's "priority" value into a rating
        $ratings = [0, 3, 7, 8, 9, 10];

        if (DI::config()->get('system', 'adjust_poll_frequency') && ($contact['network'] == Protocol::FEED)) {
            $rating = $contact['rating'];
        } elseif (array_key_exists($contact['priority'], $ratings)) {
            $rating = $ratings[$contact['priority']];
        } else {
            $rating = -1;
        }
    } else {
        // Check once a week per default for all other networks
        $rating = 9;
    }

    // Friendica and OStatus are checked once a day
    if (in_array($contact['network'], [Protocol::DFRN, Protocol::OSTATUS])) {
        $rating = 8;
    }

    // Check archived contacts or contacts with unsupported protocols once a month
    if ($contact['archive'] || in_array($contact['network'], [Protocol::ZOT, Protocol::PHANTOM])) {
        $rating = 10;
    }

    if ($rating < 0) {
        return 0;
    }

    // Poll intervals in minutes, indexed by the rating. Rating 0 uses the configured minimum.
    $min_poll_interval = max(1, DI::config()->get('system', 'min_poll_interval'));
    $poll_intervals    = [$min_poll_interval, 15, 30, 60, 120, 180, 360, 720, 1440, 10080, 43200];

    // A rating without an entry in the table yields 0 instead of an undefined offset
    return (int)($poll_intervals[$rating] ?? 0);
}
2020-05-02 01:43:00 -04:00
/**
 * Convert a tag array to a tag string
 *
 * @param array $tags List of tag names
 * @return string Comma separated list of BBCode links to the tag search
 */
private static function tagToString(array $tags): string
{
    $baseUrl = DI::baseUrl();

    // One "#[url=...]tag[/url]" link per tag, pointing to the local tag search
    $links = [];
    foreach ($tags as $tag) {
        $links[] = '#[url=' . $baseUrl . '/search?tag=' . urlencode($tag) . ']' . $tag . '[/url]';
    }

    // The separator only goes between two links, never in front of the first one
    return implode(', ', $links);
}
2018-01-20 18:52:54 -05:00
/**
 * Check whether the title only repeats the beginning of the body
 *
 * Both texts are stripped of tags, HTML entities and all whitespace before
 * they are compared. A title that ends with "..." is compared without the
 * ellipsis.
 *
 * @param string $title
 * @param string $body
 * @return bool true when the (normalised) body starts with the (normalised) title
 */
private static function titleIsBody($title, $body): bool
{
    // Reduce a text to its bare content without tags, entities and whitespace
    $normalise = static function ($text) {
        $text = html_entity_decode(trim(strip_tags($text)), ENT_QUOTES, 'UTF-8');
        return str_replace(["\n", "\r", "\t", ' '], '', $text);
    };

    $title = $normalise($title);
    $body  = $normalise($body);

    // Only the part of the body that could match the title is of interest
    if (strlen($title) < strlen($body)) {
        $body = substr($body, 0, strlen($title));
    }

    // An abbreviated title is compared without its trailing ellipsis
    if (($title !== $body) && (substr($title, -3) === '...')) {
        $pos = strlen($title) - 3;
        if ($pos > 0) {
            $title = substr($title, 0, $pos);
            $body  = substr($body, 0, $pos);
        }
    }

    // Strict comparison: numeric looking texts like "1000" and "1e3" must not be treated as equal
    return $title === $body;
}
2020-07-17 00:40:20 -04:00
2020-07-17 00:46:42 -04:00
/**
 * Creates the Atom feed for a given nickname
 *
 * Supported filters:
 * - activity (default): all the public posts
 * - posts: all the public top-level posts
 * - comments: all the public replies
 *
 * The last_update parameter is passed by value: the newest creation date of
 * the returned items is only stored in the cache entry, the caller's variable
 * is not changed.
 *
 * @param string  $owner_nick  Nickname of the feed owner
 * @param string  $last_update Date of the last update
 * @param integer $max_items   Number of maximum items to fetch
 * @param string  $filter      Feed items filter (activity, posts or comments)
 * @param boolean $nocache     Whether to bypass caching
 *
 * @return string|null Atom feed, null when no owner is found for the nickname
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 * @throws \ImagickException
 */
public static function atom($owner_nick, $last_update, $max_items = 300, $filter = 'activity', $nocache = false)
{
    $stamp = microtime(true);

    $owner = User::getOwnerDataByNick($owner_nick);
    if (!$owner) {
        return;
    }

    $cachekey = 'feed:feed:' . $owner_nick . ':' . $filter . ':' . $last_update;

    // Display events in the user's timezone
    if (strlen($owner['timezone'])) {
        DI::app()->setTimeZone($owner['timezone']);
    }

    // Remember the requested date for the log, $last_update is advanced further below
    $previous_created = $last_update;

    // A cached feed is only looked up while the owner's last item is younger than
    // 15 minutes (the cache duration) - and not at all when the caller bypasses the cache
    if (!$nocache && ((time() - strtotime($owner['last-item'])) < 15 * 60)) {
        $result = DI::cache()->get($cachekey);
        if (!is_null($result)) {
            Logger::info('Cached feed duration', ['seconds' => number_format(microtime(true) - $stamp, 3), 'nick' => $owner_nick, 'filter' => $filter, 'created' => $previous_created]);
            return $result['feed'];
        }
    }

    $check_date = empty($last_update) ? '' : DateTimeFormat::utc($last_update);
    $authorid   = Contact::getIdForURL($owner['url']);

    // Public, visible wall posts and comments of the owner from the supported networks
    $condition = [
        "`uid` = ? AND `received` > ? AND NOT `deleted` AND `gravity` IN (?, ?)
        AND `private` != ? AND `visible` AND `wall` AND `parent-network` IN (?, ?, ?, ?)",
        $owner['uid'], $check_date, GRAVITY_PARENT, GRAVITY_COMMENT,
        Item::PRIVATE, Protocol::ACTIVITYPUB,
        Protocol::OSTATUS, Protocol::DFRN, Protocol::DIASPORA,
    ];

    if ($filter === 'comments') {
        $condition[0] .= ' AND `gravity` = ?';
        $condition[]   = GRAVITY_COMMENT;
    }

    // Community accounts are not restricted to items authored by the owner
    if ($owner['account-type'] != User::ACCOUNT_TYPE_COMMUNITY) {
        $condition[0] .= ' AND `contact-id` = ? AND `author-id` = ?';
        $condition[]   = $owner['id'];
        $condition[]   = $authorid;
    }

    $params = ['order' => ['received' => true], 'limit' => $max_items];

    // The "posts" filter selects from the threads, everything else from the posts
    if ($filter === 'posts') {
        $ret = Post::selectThread(Item::DELIVER_FIELDLIST, $condition, $params);
    } else {
        $ret = Post::select(Item::DELIVER_FIELDLIST, $condition, $params);
    }

    $items = Post::toArray($ret);

    $doc = new DOMDocument('1.0', 'utf-8');
    $doc->formatOutput = true;

    $root = self::addHeader($doc, $owner, $filter);

    foreach ($items as $item) {
        $entry = self::noteEntry($doc, $item, $owner);
        $root->appendChild($entry);

        // Track the newest creation date, it is stored along with the cached feed
        if ($last_update < $item['created']) {
            $last_update = $item['created'];
        }
    }

    $feeddata = trim($doc->saveXML());

    $msg = ['feed' => $feeddata, 'last_update' => $last_update];
    DI::cache()->set($cachekey, $msg, Duration::QUARTER_HOUR);

    Logger::info('Feed duration', ['seconds' => number_format(microtime(true) - $stamp, 3), 'nick' => $owner_nick, 'filter' => $filter, 'created' => $previous_created]);

    return $feeddata;
}
/**
 * Adds the header elements to the XML document
 *
 * @param DOMDocument $doc    XML document
 * @param array       $owner  Contact data of the poster
 * @param string      $filter The related feed filter (activity, posts or comments)
 *
 * @return object header root element
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 */
private static function addHeader(DOMDocument $doc, array $owner, $filter)
{
    $root = $doc->createElementNS(ActivityNamespace::ATOM1, 'feed');
    $doc->appendChild($root);

    // Title and "self" link depend on the filter. An unknown filter leaves the title empty.
    $title   = '';
    $selfUri = '/feed/' . $owner['nick'] . '/';
    switch ($filter) {
        case 'activity':
            $title    = DI::l10n()->t('%s\'s timeline', $owner['name']);
            $selfUri .= $filter;
            break;
        case 'posts':
            // The "posts" feed keeps the plain "/feed/<nick>/" address
            $title = DI::l10n()->t('%s\'s posts', $owner['name']);
            break;
        case 'comments':
            $title    = DI::l10n()->t('%s\'s comments', $owner['name']);
            $selfUri .= $filter;
            break;
    }

    $attributes = ['uri' => 'https://friendi.ca', 'version' => FRIENDICA_VERSION . '-' . DB_UPDATE_VERSION];
    XML::addElement($doc, $root, 'generator', FRIENDICA_PLATFORM, $attributes);
    XML::addElement($doc, $root, 'id', DI::baseUrl() . '/profile/' . $owner['nick']);
    XML::addElement($doc, $root, 'title', $title);
    XML::addElement($doc, $root, 'subtitle', sprintf('Updates from %s on %s', $owner['name'], DI::config()->get('config', 'sitename')));
    XML::addElement($doc, $root, 'logo', User::getAvatarUrl($owner, Proxy::SIZE_SMALL));
    XML::addElement($doc, $root, 'updated', DateTimeFormat::utcNow(DateTimeFormat::ATOM));

    $author = self::addAuthor($doc, $owner);
    $root->appendChild($author);

    // Link to the profile page of the owner
    $attributes = ['href' => $owner['url'], 'rel' => 'alternate', 'type' => 'text/html'];
    XML::addElement($doc, $root, 'link', '', $attributes);

    OStatus::addHubLink($doc, $root, $owner['nick']);

    // Link to the feed itself
    $attributes = ['href' => DI::baseUrl() . $selfUri, 'rel' => 'self', 'type' => 'application/atom+xml'];
    XML::addElement($doc, $root, 'link', '', $attributes);

    return $root;
}
/**
 * Adds the author element to the XML document
 *
 * @param DOMDocument $doc   XML document
 * @param array       $owner Contact data of the poster
 *
 * @return \DOMElement author element
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 */
private static function addAuthor(DOMDocument $doc, array $owner)
{
    // The element is only created here, attaching it to the document is left to the caller
    $author = $doc->createElement('author');

    XML::addElement($doc, $author, 'uri', $owner['url']);
    XML::addElement($doc, $author, 'name', $owner['nick']);
    XML::addElement($doc, $author, 'email', $owner['addr']);

    return $author;
}
/**
 * Adds a regular entry element
 *
 * @param DOMDocument $doc   XML document
 * @param array       $item  Data of the item that is to be posted
 * @param array       $owner Contact data of the poster
 *
 * @return \DOMElement Entry element
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 * @throws \ImagickException
 */
private static function noteEntry(DOMDocument $doc, array $item, array $owner)
{
    // A reply from somebody else than the feed owner is only logged, the entry is created nevertheless
    if (($item['gravity'] != GRAVITY_PARENT) && (Strings::normaliseLink($item['author-link']) != Strings::normaliseLink($owner['url']))) {
        Logger::info('Feed entry author does not match feed owner', ['owner' => $owner['url'], 'author' => $item['author-link']]);
    }

    $entry = OStatus::entryHeader($doc, $owner, $item, false);

    self::entryContent($doc, $entry, $item, self::getTitle($item), '', true);
    self::entryFooter($doc, $entry, $item, $owner);

    return $entry;
}
/**
 * Adds elements to the XML document
 *
 * @param DOMDocument $doc      XML document
 * @param \DOMElement $entry    Entry element where the content is added
 * @param array       $item     Data of the item that is to be posted
 * @param string      $title    Title for the post
 * @param string      $verb     The activity verb
 * @param bool        $complete Add the "status_net" element?
 * @return void
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 */
private static function entryContent(DOMDocument $doc, \DOMElement $entry, array $item, $title, $verb = "", $complete = true)
{
    // $verb and $complete are part of the signature, but none of the elements
    // written here depends on them, so no verb is constructed any more.

    XML::addElement($doc, $entry, 'id', $item['uri']);
    XML::addElement($doc, $entry, 'title', html_entity_decode($title, ENT_QUOTES, 'UTF-8'));

    // The BBCode body is delivered as HTML content
    $body = OStatus::formatPicturePost($item['body'], $item['uri-id']);
    $body = BBCode::convertForUriId($item['uri-id'], $body, BBCode::ACTIVITYPUB);
    XML::addElement($doc, $entry, 'content', $body, ['type' => 'html']);

    // Link to the local display page of the item
    $attributes = [
        'rel'  => 'alternate',
        'type' => 'text/html',
        'href' => DI::baseUrl() . '/display/' . $item['guid'],
    ];
    XML::addElement($doc, $entry, 'link', '', $attributes);

    // The stored dates get an explicit UTC offset before they are converted to the Atom format
    XML::addElement($doc, $entry, 'published', DateTimeFormat::utc($item['created'] . '+00:00', DateTimeFormat::ATOM));
    XML::addElement($doc, $entry, 'updated', DateTimeFormat::utc($item['edited'] . '+00:00', DateTimeFormat::ATOM));
}
/**
 * Adds the elements at the foot of an entry to the XML document
 *
 * @param DOMDocument $doc   XML document
 * @param object      $entry The entry element where the elements are added
 * @param array       $item  Data of the item that is to be posted
 * @param array       $owner Contact data of the poster
 * @return void
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 */
private static function entryFooter(DOMDocument $doc, $entry, array $item, array $owner)
{
    if ($item['gravity'] != GRAVITY_PARENT) {
        $parent_plink = null;

        // Prefer the direct parent (thr-parent) of the reply ...
        $thrparent = Post::selectFirst(['plink'], ['uid' => $owner['uid'], 'uri' => $item['thr-parent']]);
        if (DBA::isResult($thrparent)) {
            $parent_plink = $thrparent['plink'];
        } else {
            // ... and only fall back to the thread starter when it is missing
            $parent = Post::selectFirst(['guid'], ['id' => $item['parent']]);
            if (DBA::isResult($parent)) {
                $parent_plink = DI::baseUrl() . '/display/' . $parent['guid'];
            } else {
                DI::logger()->notice('Missing parent and thr-parent for child item', ['item' => $item]);
            }
        }

        // Without a link to the parent no reply relation is added
        if (isset($parent_plink)) {
            $attributes = [
                'ref'  => $item['thr-parent'],
                'href' => $parent_plink,
            ];
            XML::addElement($doc, $entry, 'thr:in-reply-to', '', $attributes);

            $attributes = [
                'rel'  => 'related',
                'href' => $parent_plink,
            ];
            XML::addElement($doc, $entry, 'link', '', $attributes);
        }
    }

    // uri-id isn't present for follow entry pseudo-items
    $tags = Tag::getByURIId($item['uri-id'] ?? 0);

    // Only hashtags are published as categories
    foreach ($tags as $tag) {
        if ($tag['type'] == Tag::HASHTAG) {
            XML::addElement($doc, $entry, 'category', '', ['term' => $tag['name']]);
        }
    }

    OStatus::getAttachment($doc, $entry, $item);
}
2020-07-17 01:27:45 -04:00
/**
 * Fetch or create title for feed entry
 *
 * @param array $item Item data; `title`, `body` and `uri-id` are read
 * @return string title
 */
private static function getTitle(array $item)
{
    if ($item['title'] != '') {
        return BBCode::convertForUriId($item['uri-id'], $item['title'], BBCode::ACTIVITYPUB);
    }

    // Fetch information about the post
    $siteinfo = BBCode::getAttachedData($item['body']);
    if (isset($siteinfo['title'])) {
        return $siteinfo['title'];
    }

    // If no bookmark is found then take the first line
    // Remove the share element before fetching the first line
    $text = trim(preg_replace('/\[share.*?\](.*?)\[\/share\]/ism', "\n$1\n", $item['body']));

    // The appended line break guarantees that the search below finds an end of line
    $text = BBCode::toPlaintext($text) . "\n";

    $pos = strpos($text, "\n");
    if (($pos == 0) || ($pos > 100)) {
        // No usable first line: shorten to at most 100 bytes.
        // mb_strcut() never cuts inside a multi-byte character, so the result stays valid UTF-8.
        return mb_strcut($text, 0, 100, 'UTF-8') . '...';
    }

    // The first line ends at a line break, a byte-wise cut is safe here
    return substr($text, 0, $pos);
}
2015-09-27 08:02:05 -04:00
}