2019-04-20 08:15:45 -04:00
< ? php
/**
 * Name: blockbot
 * Description: Blocking bots based on detecting bots/crawlers/spiders via the user agent and http_from header.
 * Version: 0.2
 * Author: Philipp Holzer <admin@philipp.info>
 * Author: Michael Vogel <https://pirati.ca/profile/heluecht>
 *
 */
use Friendica\App ;
use Friendica\Core\Hook ;
use Friendica\Core\System ;
2020-01-18 16:07:06 -05:00
use Friendica\DI ;
2019-04-20 08:15:45 -04:00
use Jaybizzle\CrawlerDetect\CrawlerDetect ;
2019-04-27 07:51:44 -04:00
use Friendica\Core\Logger ;
2019-07-28 03:49:30 -04:00
use Friendica\Core\Renderer ;
2019-04-20 08:15:45 -04:00
2019-04-21 06:35:33 -04:00
require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php' ;
2019-04-20 14:38:32 -04:00
/**
 * Registers blockbot_init_1() from this file as a handler for the 'init_1' hook.
 */
function blockbot_install()
{
	Hook::register('init_1', __FILE__, 'blockbot_init_1');
}
2019-04-20 14:38:32 -04:00
/**
 * Removes the 'init_1' hook handler blockbot_init_1() that belongs to this file.
 */
function blockbot_uninstall()
{
	Hook::unregister('init_1', __FILE__, 'blockbot_init_1');
}
2019-07-28 03:49:30 -04:00
/**
 * Renders the addon's admin settings form.
 *
 * Loads the "admin.tpl" template from "addon/blockbot/" and fills it with the
 * current values of the 'good_crawlers', 'block_gab' and 'training' settings.
 * Labels and help texts are both passed through the translation layer.
 *
 * @param mixed  $a Passed by reference by the caller; not used here.
 * @param string $o Receives the rendered form markup; any previous value is overwritten.
 */
function blockbot_addon_admin(&$a, &$o)
{
	$template = Renderer::getMarkupTemplate('admin.tpl', 'addon/blockbot/');

	// Each field is [name, label, current value, help text].
	$o = Renderer::replaceMacros($template, [
		'$submit' => DI::l10n()->t('Save Settings'),
		'$good_crawlers' => [
			'good_crawlers',
			DI::l10n()->t('Allow "good" crawlers'),
			DI::config()->get('blockbot', 'good_crawlers'),
			DI::l10n()->t("Don't block fediverse crawlers, relay servers and other bots with good purposes."),
		],
		'$block_gab' => [
			'block_gab',
			DI::l10n()->t('Block GabSocial'),
			DI::config()->get('blockbot', 'block_gab'),
			DI::l10n()->t('Block the software GabSocial. This will block every access for that software. You can block dedicated gab instances in the blocklist settings in the admin section.'),
		],
		'$training' => [
			'training',
			DI::l10n()->t('Training mode'),
			DI::config()->get('blockbot', 'training'),
			DI::l10n()->t("Activates the training mode. This is only meant for developing purposes. Don't activate this on a production machine. This can cut communication with some systems."),
		],
	]);
}
/**
 * Stores the submitted admin settings.
 *
 * Each of the 'good_crawlers', 'block_gab' and 'training' settings is written
 * from the matching $_POST field; a field that is missing from the request is
 * stored as false. Afterwards a "Settings updated." notice is emitted.
 *
 * @param mixed $a Passed by reference by the caller; not used here.
 */
function blockbot_addon_admin_post(&$a)
{
	foreach (['good_crawlers', 'block_gab', 'training'] as $setting) {
		// Null coalescing: fields absent from the request are stored as false.
		DI::config()->set('blockbot', $setting, $_POST[$setting] ?? false);
	}

	info(DI::l10n()->t('Settings updated.') . EOL);
}
2019-04-20 14:38:32 -04:00
/**
 * 'init_1' hook handler: rejects requests whose user agent looks like a bot.
 *
 * Flow:
 *  1. Requests without a user agent are let through untouched.
 *  2. If the 'good_crawlers' setting is on, user agents matching the "good"
 *     list are let through; otherwise that list is added to the block list.
 *  3. If the 'block_gab' setting is on, 'GabSocial/' is added to the block list.
 *  4. A user agent matching the block list is answered with HTTP 403.
 *  5. Only when the 'training' setting is on: CrawlerDetect is consulted, known
 *     false positives are logged and let through, everything else it flags is
 *     logged and answered with HTTP 403.
 *
 * All matching is a case-insensitive substring test against the user agent.
 *
 * @param App $a Application object supplied by the hook system; not used here.
 */
function blockbot_init_1(App $a)
{
	if (empty($_SERVER['HTTP_USER_AGENT'])) {
		return;
	}

	$user_agent = $_SERVER['HTTP_USER_AGENT'];

	// REQUEST_URI is not guaranteed to be set; fall back to an empty string
	// instead of raising an undefined index notice.
	$logdata = ['agent' => $user_agent, 'uri' => $_SERVER['REQUEST_URI'] ?? ''];

	// List of "good" crawlers
	$good_agents = ['fediverse.space crawler', 'fediverse.network crawler', 'Active_Pods_CheckBot_3.0',
		'Social-Relay/', 'Test Certificate Info', 'Uptimebot/', 'GNUSocialBot', 'UptimeRobot/'];

	// List of known crawlers.
	$agents = ['SemrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/',
		'Diffbot/', 'Twitterbot/', 'YisouSpider', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/',
		'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider',
		'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/',
		'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider',
		'AwarioRssBot/', 'Zabbix', 'TweetmemeBot/', 'dcrawl/', 'PhantomJS/', 'Googlebot-Image/',
		'CrowdTanglebot/', 'Mediapartners-Google', 'Baiduspider/', 'datagnionbot',
		'MegaIndex.ru/', 'SMUrlExpander', 'Hatena-Favicon/', 'Wappalyzer', 'FlipboardProxy/',
		'NetcraftSurveyAgent/', 'Dataprovider.com', 'SMTBot/', 'Nimbostratus-Bot/',
		'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler',
		'AhrefsBot/', 'YandexBot/', 'Exabot/', 'Mediumbot-MetaTagFetcher/', 'WhatsApp/',
		'TelegramBot', 'SurdotlyBot/', 'BingPreview/', 'SabsimBot/', 'CCBot/', 'WbSrch/',
		'DuckDuckBot-Https/', 'HTTP Banner Detection', 'YandexImages/', 'archive.org_bot',
		'ArchiveTeam ArchiveBot/', 'yacybot', 'https://developers.google.com/+/web/snippet/',
		'Scrapy/', 'github-camo', 'MJ12bot/', 'DotBot/', 'Pinterestbot/', 'Jooblebot/',
		'Cliqzbot/', 'YaK/', 'Mediatoolkitbot', 'Snacktory', 'FunWebProducts', 'oBot/',
		'7Siters/', 'KOCMOHABT', 'Google-SearchByImage', 'FemtosearchBot/',
		'HubSpot Crawler', 'DomainStatsBot/', 'Re-re Studio', 'AwarioSmartBot/',
		'SummalyBot/', 'DNSResearchBot/', 'PetalBot;', 'Nmap Scripting Engine;',
		'Google-Apps-Script; beanserver;', 'woorankreview/', 'Seekport Crawler;', 'AHC/'];

	if (DI::config()->get('blockbot', 'good_crawlers')) {
		// "Good" crawlers are explicitly allowed: let them pass before any blocking.
		if (blockbot_contains_agent($user_agent, $good_agents)) {
			return;
		}
	} else {
		// "Good" crawlers get no special treatment: block them like the others.
		$agents = array_merge($agents, $good_agents);
	}

	if (DI::config()->get('blockbot', 'block_gab')) {
		$agents[] = 'GabSocial/';
	}

	if (blockbot_contains_agent($user_agent, $agents)) {
		System::httpExit(403, 'Bots are not allowed');
	}

	// This switch here is only meant for developers who want to add more bots to the list above, it is not safe for production.
	if (!DI::config()->get('blockbot', 'training')) {
		return;
	}

	$crawlerDetect = new CrawlerDetect();

	if (!$crawlerDetect->isCrawler()) {
		Logger::debug('Good user agent detected', $logdata);
		return;
	}

	// List of false positives' strings of known "good" agents.
	// The "good" crawler list does not need to be merged in here: when the
	// 'good_crawlers' setting is on, matching user agents already returned above.
	$false_positives = ['curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest', 'Feedly/',
		'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/',
		'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests',
		'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma',
		'Dispatch/', 'Ruby', 'Java/', 'libwww-perl/', 'Mastodon/',
		'lua-resty-http/', 'Tiny Tiny RSS/', 'Wget/', 'PostmanRuntime/',
		'W3C_Validator/', 'NetNewsWire'];

	if (blockbot_contains_agent($user_agent, $false_positives)) {
		Logger::notice('False positive', $logdata);
		return;
	}

	Logger::info('Blocked bot', $logdata);
	System::httpExit(403, 'Bots are not allowed');
}

/**
 * Tells whether a user agent string contains any of the given fragments.
 *
 * The comparison is a case-insensitive substring search.
 *
 * @param string   $user_agent User agent string to inspect.
 * @param string[] $fragments  Non-empty fragments to look for.
 *
 * @return bool True as soon as one fragment is found, false if none matches.
 */
function blockbot_contains_agent(string $user_agent, array $fragments): bool
{
	foreach ($fragments as $fragment) {
		// Compare against false explicitly: a match at offset 0 is still a match.
		if (stripos($user_agent, $fragment) !== false) {
			return true;
		}
	}

	return false;
}