File: /home/beaupptk/birthkumar.shop/wp-content/plugins/litespeed-cache/src/crawler.cls.php
<?php
/**
* The crawler class.
*
* @package LiteSpeed
* @since 1.1.0
*/
namespace LiteSpeed;
defined( 'WPINC' ) || exit();
/**
* Handles sitemap crawling, blacklisting, and async operations.
*/
class Crawler extends Root {
const LOG_TAG = 'πΈοΈ';
// Action type identifiers. Their consumers are not visible in this part of the file;
// presumably a request handler dispatches on them — TODO confirm.
const TYPE_REFRESH_MAP = 'refresh_map';
const TYPE_EMPTY = 'empty';
const TYPE_BLACKLIST_EMPTY = 'blacklist_empty';
const TYPE_BLACKLIST_DEL = 'blacklist_del';
const TYPE_BLACKLIST_ADD = 'blacklist_add';
const TYPE_START = 'start';
const TYPE_RESET = 'reset';
// User-agent tokens. FAST_USER_AGENT is prefixed to the UA by _get_curl_options() for crawler-only requests.
const USER_AGENT = 'lscache_walker';
const FAST_USER_AGENT = 'lscache_runner';
// Page size passed to Crawler_Map::list_map() by _do_running().
const CHUNKS = 10000;
// Single-character crawl results, as returned by _status_parse() and counted in `crawler_stats`.
const STATUS_WAIT = 'W'; // NOTE(review): presumably "waiting / not crawled yet" — confirm.
const STATUS_HIT = 'H'; // Response was a cache hit (or HTTP 201).
const STATUS_MISS = 'M'; // Response was a cache miss.
const STATUS_BLACKLIST = 'B'; // No cache header found in the response.
const STATUS_NOCACHE = 'N'; // Response carried "X-Litespeed-Cache-Control: no-cache".
/**
* Sitemeta file slug.
*
* @var string
*/
private $_sitemeta = 'meta.data';
/**
* Reset file full path.
*
* @var string
*/
private $_resetfile;
/**
* Reason that ended current run.
*
* @var string
*/
private $_end_reason;
/**
* Number of CPU cores.
*
* @var int
*/
private $_ncpu = 1;
/**
* Server IP set in settings.
*
* @var string|null
*/
private $_server_ip;
/**
* Crawler configuration.
*
* @var array
*/
private $_crawler_conf = [
'cookies' => [],
'headers' => [],
'ua' => '',
];
/**
* Built crawler variants.
*
* @var array<int,array>
*/
private $_crawlers = [];
/**
* Current allowed worker threads.
*
* @var int
*/
private $_cur_threads = -1;
/**
* Max timestamp to run until.
*
* @var int
*/
private $_max_run_time;
/**
* Last time threads were adjusted.
*
* @var int
*/
private $_cur_thread_time;
/**
* Map-status list to batch-save.
*
* @var array
*/
private $_map_status_list = [
'H' => [],
'M' => [],
'B' => [],
'N' => [],
];
/**
* Summary cache.
*
* @var array
*/
protected $_summary;
/**
 * Set up the crawler instance.
 *
 * Picks the per-site meta file name (one per blog on multisite), derives the
 * reset-file path from it, loads the stored summary and records the CPU core
 * count and the configured server IP.
 *
 * @since 1.1.0
 */
public function __construct() {
	if ( is_multisite() ) {
		$blog_id         = get_current_blog_id();
		$this->_sitemeta = 'meta' . $blog_id . '.data';
	}

	$crawler_dir      = LITESPEED_STATIC_DIR . '/crawler/';
	$this->_resetfile = $crawler_dir . $this->_sitemeta . '.reset';

	$this->_summary   = self::get_summary();
	$this->_ncpu      = $this->_get_server_cpu();
	$this->_server_ip = $this->conf( Base::O_SERVER_IP );

	self::debug( 'Init w/ CPU cores=' . $this->_ncpu );
}
/**
 * Detect the number of CPU cores by counting "processor" lines in /proc/cpuinfo.
 *
 * Falls back to 1 whenever detection is not possible: `open_basedir` is set,
 * the file is missing or unreadable, the read fails, or no line matches.
 *
 * @since 5.2
 * @return int Number of cores detected (always >= 1).
 */
private function _get_server_cpu() {
	$cpuinfo_file = '/proc/cpuinfo';

	if ( ini_get( 'open_basedir' ) ) {
		return 1; // Server has limit.
	}

	try {
		// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
		if ( ! @is_file( $cpuinfo_file ) || ! @is_readable( $cpuinfo_file ) ) {
			return 1;
		}
		// Local system read; no WP alternative. Suppress sniff.
		// phpcs:ignore WordPress.WP.AlternativeFunctions.file_get_contents_file_get_contents
		$cpuinfo = file_get_contents( $cpuinfo_file );
	} catch ( \Exception $e ) {
		// A custom error handler may turn filesystem warnings into exceptions.
		return 1;
	}

	// file_get_contents() returns false on failure; never feed that to the regex.
	if ( ! is_string( $cpuinfo ) || '' === $cpuinfo ) {
		return 1;
	}

	// preg_match_all() returns the number of matches, or false on error.
	$cnt = preg_match_all( '/^processor/m', $cpuinfo, $matches );

	return $cnt ? (int) $cnt : 1;
}
/**
 * Tell whether a crawler is enabled, i.e. not present in the stored `bypass_list` option.
 *
 * @since 4.3
 * @param int $curr Crawler index.
 * @return bool True when the crawler is not bypassed.
 */
public function is_active( $curr ) {
	$crawler_id  = (int) $curr;
	$disabled    = self::get_option( 'bypass_list', [] );
	$is_bypassed = in_array( $crawler_id, $disabled, true );

	return ! $is_bypassed;
}
/**
 * Flip a crawler between bypassed and active and persist the `bypass_list` option.
 *
 * @since 4.3
 * @param int $curr Crawler index.
 * @return bool True if the crawler is now active, false if it is now bypassed.
 */
public function toggle_activeness( $curr ) {
	$crawler_id = (int) $curr;
	$disabled   = self::get_option( 'bypass_list', [] );
	$position   = array_search( $crawler_id, $disabled, true );

	if ( false === $position ) {
		// Not bypassed yet: add it, which turns the crawler off.
		$disabled[] = $crawler_id;
		self::update_option( 'bypass_list', $disabled );
		return false;
	}

	// Currently bypassed: drop the entry and re-index the list.
	unset( $disabled[ $position ] );
	self::update_option( 'bypass_list', array_values( $disabled ) );

	return true;
}
/**
 * Empty the `bypass_list` option so every crawler counts as active again,
 * then show an admin note and log the change.
 *
 * @since 4.3
 * @access public
 * @return void
 */
public function clear_disabled_list() {
	$empty_list = [];
	self::update_option( 'bypass_list', $empty_list );

	Admin_Display::note( __( 'Crawler disabled list is cleared! All crawlers are set to active! ', 'litespeed-cache' ) );

	self::debug( 'All crawlers are set to active...... ' );
}
/**
 * Return the crawler summary with every known field pre-filled with a default.
 *
 * Values stored by the parent summary override the defaults.
 *
 * @since 3.0
 * @access public
 *
 * @param string|false $field Field name to fetch, or false to get the whole summary.
 * @return mixed The whole summary array, a single field value, or null when the field is unknown.
 */
public static function get_summary( $field = false ) {
	$defaults = [
		'list_size'                   => 0,
		'last_update_time'            => 0,
		'curr_crawler'                => 0,
		'curr_crawler_beginning_time' => 0,
		'last_pos'                    => 0,
		'last_count'                  => 0,
		'last_crawled'                => 0,
		'last_start_time'             => 0,
		'last_status'                 => '',
		'is_running'                  => 0,
		'end_reason'                  => '',
		'meta_save_time'              => 0,
		'pos_reset_check'             => 0,
		'done'                        => 0,
		'this_full_beginning_time'    => 0,
		'last_full_time_cost'         => 0,
		'last_crawler_total_cost'     => 0,
		'crawler_stats'               => [], // Per-crawler hit/miss crawl counters.
	];

	// Drop the cached options entry first so the summary read below is current.
	wp_cache_delete( 'alloptions', 'options' );

	$merged = array_merge( $defaults, parent::get_summary() );

	if ( false === $field ) {
		return $merged;
	}

	return array_key_exists( $field, $merged ) ? $merged[ $field ] : null;
}
/**
 * Save the summary through the parent and mirror the saved payload to the crawler meta file.
 *
 * The instance's `meta_save_time` is always stamped first. When no data is
 * passed, the instance's current summary is what gets saved and written.
 *
 * @since 3.0
 * @access public
 *
 * @param array|false $data      Data to save, or false to save the current summary.
 * @param bool        $reload    Forwarded to the parent implementation.
 * @param bool        $overwrite Forwarded to the parent implementation.
 * @return void
 */
public static function save_summary( $data = false, $reload = false, $overwrite = false ) {
	$crawler = self::cls();

	$crawler->_summary['meta_save_time'] = time();

	$payload = false === $data ? $crawler->_summary : $data;

	parent::save_summary( $payload, $reload, $overwrite );

	$meta_path = LITESPEED_STATIC_DIR . '/crawler/' . $crawler->_sitemeta;
	File::save( $meta_path, wp_json_encode( $payload ), true );
}
/**
 * Cron entry point: hand the 'crawler' task to Task::async_call().
 *
 * @since 5.5
 * @return void
 */
public static function start_async_cron() {
	$task = 'crawler';
	Task::async_call( $task );
}
/**
 * Manual entry point: hand the 'crawler_force' task to Task::async_call()
 * and show a success notice.
 *
 * @since 5.5
 * @return void
 */
public static function start_async() {
	$task = 'crawler_force';
	Task::async_call( $task );

	Admin_Display::success( __( 'Started async crawling', 'litespeed-cache' ) );
}
/**
 * Handler for the async crawl request: logs a marker and starts crawling.
 *
 * @since 5.5
 * @param bool $manually_run Whether the crawl was manually triggered.
 * @return void
 */
public static function async_handler( $manually_run = false ) {
	$is_manual = (bool) $manually_run;

	self::debug( '------------async-------------start_async_handler' );
	self::start( $is_manual );
}
/**
 * Start a crawl pass if Router::can_crawl() permits it.
 *
 * @since 1.1.0
 * @access public
 *
 * @param bool $manually_run Whether the crawl was manually triggered.
 * @return bool|void False when crawling is not allowed; nothing otherwise.
 */
public static function start( $manually_run = false ) {
	$allowed = Router::can_crawl();
	if ( ! $allowed ) {
		self::debug( '......crawler is NOT allowed by the server admin......' );
		return false;
	}

	$is_manual = (bool) $manually_run;
	if ( $is_manual ) {
		self::debug( '......crawler manually ran......' );
	}

	self::cls()->_crawl_data( $is_manual );
}
/**
 * Run one crawl pass for the current crawler.
 *
 * Claims the lane file, (re)generates the sitemap when needed, skips bypassed
 * crawlers, loads the crawler configuration and starts the engine. The lane is
 * released on every exit path except when the engine throws.
 *
 * @since 1.1.0
 * @access private
 *
 * @param bool $manually_run Whether the crawl was manually triggered; a manual run ignores the crawl interval.
 * @return void
 */
private function _crawl_data( $manually_run ) {
	if ( ! defined( 'LITESPEED_LANE_HASH' ) ) {
		define( 'LITESPEED_LANE_HASH', Str::rrand( 8 ) );
	}

	// Bail out early when the lane is held by someone else.
	if ( ! $this->_check_valid_lane() ) {
		self::debug( 'β οΈ lane in use' );
		return;
	}
	$this->_take_over_lane();

	self::debug( '......crawler started......' );

	// First run: no summary yet, or one of the crawler tables is reported missing.
	$needs_bootstrap = ! $this->_summary || ! Data::cls()->tb_exist( 'crawler' ) || ! Data::cls()->tb_exist( 'crawler_blacklist' );
	if ( $needs_bootstrap ) {
		$this->cls( 'Crawler_Map' )->gen();
	}

	// The previous full pass finished: wait out the interval or rebuild the sitemap.
	if ( 'touchedEnd' === $this->_summary['done'] ) {
		$last_finished_at = (int) $this->_summary['last_full_time_cost'] + (int) $this->_summary['this_full_beginning_time'];
		$too_soon         = ! $manually_run && ( time() - $last_finished_at ) < $this->conf( Base::O_CRAWLER_CRAWL_INTERVAL );

		if ( $too_soon ) {
			self::debug( 'Cron abort: cache warmed already.' );
			$this->Release_lane();
			return;
		}

		self::debug( 'TouchedEnd. regenerate sitemap....' );
		$this->cls( 'Crawler_Map' )->gen();
	}

	$crawlers_count = count( $this->list_crawlers() );

	// Advance past crawlers that are in the bypass list.
	while ( ! $this->is_active( $this->_summary['curr_crawler'] ) && $this->_summary['curr_crawler'] < $crawlers_count ) {
		$skipped = $this->_summary['curr_crawler'];
		self::debug( 'Skipped the Crawler #' . $skipped . ' ......' );
		$this->_summary['curr_crawler'] = (int) $skipped + 1;
	}

	// Ran past the last crawler: close this pass.
	if ( $this->_summary['curr_crawler'] >= $crawlers_count ) {
		$this->_end_reason = 'end';
		$this->_terminate_running();
		$this->Release_lane();
		return;
	}

	// In case crawlers are all done but not reloaded, restart from crawler 0.
	$no_index   = empty( $this->_summary['curr_crawler'] );
	$no_crawler = $no_index || empty( $this->_crawlers[ $this->_summary['curr_crawler'] ] );
	if ( $no_crawler ) {
		$this->_summary['curr_crawler']       = 0;
		$this->_summary['crawler_stats'][ 0 ] = [];
	}

	if ( ! $this->load_conf() ) {
		self::debug( 'Load conf failed' );
		$this->_terminate_running();
		$this->Release_lane();
		return;
	}

	try {
		$this->_engine_start();
		$this->Release_lane();
	} catch ( \Exception $e ) {
		// The lane is deliberately not released here.
		self::debug( 'π ' . $e->getMessage() );
	}
}
/**
 * Build the request configuration for the current crawler.
 *
 * Fills `$this->_crawler_conf` with the base URL, simulated cookies, Accept
 * header, user agent, run delay, run duration and load limit.
 *
 * @since 3.0
 * @access private
 * @return bool True on success; false when the load limit is 0 or a role crawler has no server IP.
 */
private function load_conf() {
	$this->_crawler_conf['base'] = site_url();

	$current_crawler = $this->_crawlers[ $this->_summary['curr_crawler'] ];

	// Cookies: every "cookie:<name>" entry except the '_null' placeholder.
	foreach ( $current_crawler as $name => $value ) {
		$is_cookie = 0 === strpos( $name, 'cookie:' );
		if ( $is_cookie && '_null' !== $value ) {
			$this->_crawler_conf['cookies'][ substr( $name, 7 ) ] = $value;
		}
	}

	// WebP/AVIF simulation.
	if ( ! empty( $current_crawler['webp'] ) ) {
		$image_type                       = 2 === (int) $this->conf( Base::O_IMG_OPTM_WEBP ) ? 'avif' : 'webp';
		$this->_crawler_conf['headers'][] = 'Accept: image/' . $image_type . ',*/*';
	}

	// Mobile crawler.
	if ( ! empty( $current_crawler['mobile'] ) ) {
		$this->_crawler_conf['ua'] = 'Mobile iPhone';
	}

	// Run delay in microseconds: the constant and the server variable may only raise it.
	$run_delay = 500;
	if ( defined( 'LITESPEED_CRAWLER_USLEEP' ) && constant( 'LITESPEED_CRAWLER_USLEEP' ) > $run_delay ) {
		$run_delay = (int) constant( 'LITESPEED_CRAWLER_USLEEP' );
	}
	if ( isset( $_SERVER[ Base::ENV_CRAWLER_USLEEP ] ) ) {
		$env_usleep = absint( wp_unslash( $_SERVER[ Base::ENV_CRAWLER_USLEEP ] ) );
		if ( $env_usleep > (int) $run_delay ) {
			$run_delay = $env_usleep;
		}
	}
	$this->_crawler_conf['run_delay'] = $run_delay;

	$this->_crawler_conf['run_duration'] = $this->get_crawler_duration();

	// Load limit: the "enforce" variable replaces it, the plain variable may only lower it.
	$load_limit = (int) $this->conf( Base::O_CRAWLER_LOAD_LIMIT );
	if ( isset( $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT_ENFORCE ] ) ) {
		$load_limit = absint( wp_unslash( $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT_ENFORCE ] ) );
	} elseif ( isset( $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT ] ) ) {
		$env_limit = absint( wp_unslash( $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT ] ) );
		if ( $env_limit < (int) $load_limit ) {
			$load_limit = $env_limit;
		}
	}
	$this->_crawler_conf['load_limit'] = $load_limit;

	if ( 0 === (int) $load_limit ) {
		self::debug( 'π Terminated crawler due to load limit set to 0' );
		return false;
	}

	// Role simulation needs the server IP plus the vary and hash cookies.
	if ( ! empty( $current_crawler['uid'] ) ) {
		if ( empty( $this->_server_ip ) ) {
			self::debug( 'π Terminated crawler due to Server IP not set' );
			return false;
		}

		$uid       = $current_crawler['uid'];
		$vary_name = $this->cls( 'Vary' )->get_vary_name();
		$vary_val  = $this->cls( 'Vary' )->finalize_default_vary( $uid );

		$this->_crawler_conf['cookies'][ $vary_name ]     = $vary_val;
		$this->_crawler_conf['cookies']['litespeed_hash'] = Router::cls()->get_hash( $uid );
	}

	return true;
}
/**
 * Get how long a single crawl run may last.
 *
 * Uses LITESPEED_CRAWLER_DURATION when defined, capped at 900 seconds;
 * otherwise 900 seconds.
 *
 * @since 7.0
 * @return int Seconds.
 */
public function get_crawler_duration() {
	$ceiling = 900;

	if ( ! defined( 'LITESPEED_CRAWLER_DURATION' ) ) {
		return $ceiling;
	}

	$requested = (int) constant( 'LITESPEED_CRAWLER_DURATION' );

	// Values above the ceiling are reset to the ceiling.
	return $requested > $ceiling ? $ceiling : $requested;
}
/**
 * Start the crawl engine: size the thread pool, work out the time budget, then run.
 *
 * Returns without crawling when the initial thread count comes back as 0.
 *
 * @since 1.1.0
 * @access private
 * @return void
 */
private function _engine_start() {
	// Check current load.
	$this->_adjust_current_threads();
	if ( 0 === (int) $this->_cur_threads ) {
		$this->_end_reason = 'stopped_highload';
		self::debug( 'Stopped due to heavy load.' );
		return;
	}

	// Log started time.
	self::save_summary( [ 'last_start_time' => time() ] );

	// Time budget: PHP's own limit minus 5s of headroom, or 300s when PHP reports no limit.
	$ini_max = (int) ini_get( 'max_execution_time' );
	self::debug( 'ini_get max_execution_time=' . $ini_max );
	$max_time = 0 === $ini_max ? 300 : $ini_max - 5;

	$run_duration = $this->_crawler_conf['run_duration'];

	if ( $max_time >= (int) $run_duration ) {
		// PHP allows at least the configured duration: use the configured duration.
		$max_time = (int) $run_duration;
		self::debug( 'Use run_duration setting as max_execution_time=' . $max_time );
		// phpcs:ignore WordPress.PHP.IniSet.max_execution_time_Disallowed -- Required for crawler functionality.
	} elseif ( false !== ini_set( 'max_execution_time', $run_duration + 15 ) ) {
		// PHP's limit was too short but could be raised.
		$max_time = $run_duration;
		self::debug( 'ini_set max_execution_time=' . $max_time );
	}

	self::debug( 'final max_execution_time=' . $max_time );
	$this->_max_run_time = $max_time + time();

	// Mark running, crawl, then persist the outcome.
	$this->_prepare_running();
	$this->_do_running();
	$this->_terminate_running();
}
/**
 * Get the most recent load-average sample reported by sys_getloadavg().
 *
 * @since 5.5
 * @return int|float The load as a float, or -1 when it cannot be read
 *                   (function missing, or sys_getloadavg() failed).
 */
public function get_server_load() {
	if ( ! function_exists( 'sys_getloadavg' ) ) {
		return -1;
	}

	$samples = sys_getloadavg();

	// sys_getloadavg() returns false on failure. Report "unknown" rather than a load of 0,
	// which callers would read as an idle server.
	if ( ! is_array( $samples ) || ! isset( $samples[0] ) ) {
		self::debug( 'Server load unavailable: sys_getloadavg() failed' );
		return -1;
	}

	$curload = (float) $samples[0];
	self::debug( 'Server load: ' . $curload );

	return $curload;
}
/**
 * Adjust the number of worker threads to the current per-core server load.
 *
 * On the first call (`_cur_threads` is -1) the count is sized from the gap
 * between the load limit and the load, capped by LITESPEED_CRAWLER_THREADS
 * (default 3). On later calls it moves by at most one step. The stored count
 * is never negative; 0 tells callers to stop crawling.
 *
 * @since 1.1.0
 * @access private
 * @return void
 */
private function _adjust_current_threads() {
	$curload = $this->get_server_load();
	if ( -1 === (int) $curload ) {
		self::debug( 'set threads=0 due to func sys_getloadavg not exist!' );
		$this->_cur_threads = 0;
		return;
	}

	// Normalise to a per-core load.
	$curload /= (float) $this->_ncpu;

	$crawler_threads = defined( 'LITESPEED_CRAWLER_THREADS' ) ? (int) constant( 'LITESPEED_CRAWLER_THREADS' ) : 3;
	$load_limit      = (float) $this->_crawler_conf['load_limit'];
	$current_threads = (int) $this->_cur_threads;

	if ( -1 === $current_threads ) {
		// Init.
		if ( $curload > $load_limit ) {
			$curthreads = 0;
		} elseif ( $curload >= ( $load_limit - 1 ) ) {
			$curthreads = 1;
		} else {
			$curthreads = (int) ( $load_limit - $curload );
			if ( $curthreads > $crawler_threads ) {
				$curthreads = $crawler_threads;
			}
		}
	} else {
		// Adjust.
		$curthreads = $current_threads;
		if ( $curload >= ( $load_limit + 1 ) ) {
			sleep( 5 ); // Sleep 5 secs before continuing with fewer threads.
			if ( $curthreads >= 1 ) {
				--$curthreads;
			}
		} elseif ( $curload >= $load_limit ) {
			--$curthreads;
		} elseif ( ( $curload + 1 ) < $load_limit ) {
			if ( $curthreads < $crawler_threads ) {
				++$curthreads;
			}
		}
	}

	// Clamp: a negative count (e.g. from a negative LITESPEED_CRAWLER_THREADS) would slip past
	// the callers' "=== 0" stop checks and reach array_chunk() as an invalid chunk size.
	$this->_cur_threads     = max( 0, (int) $curthreads );
	$this->_cur_thread_time = time();
}
/**
 * Mark the crawler as running in the summary and persist it.
 *
 * Timing marks are only (re)set when the run starts at list position 0.
 *
 * @since 1.1.0
 * @access private
 * @return void
 */
private function _prepare_running() {
	$this->_summary['is_running']   = time();
	$this->_summary['done']         = 0; // Reset done status.
	$this->_summary['last_status']  = 'prepare running';
	$this->_summary['last_crawled'] = 0;

	$at_list_start = 0 === (int) $this->_summary['last_pos'];

	// Current crawler start-time mark.
	if ( $at_list_start ) {
		$this->_summary['curr_crawler_beginning_time'] = time();
	}

	// The very first crawler at position 0 begins a new full pass.
	if ( $at_list_start && 0 === (int) $this->_summary['curr_crawler'] ) {
		$this->_summary['this_full_beginning_time'] = time();
		$this->_summary['list_size']                = $this->cls( 'Crawler_Map' )->count_map();
	}

	// Previous run ended normally: start this crawler's stats from scratch.
	if ( $at_list_start && 'end' === $this->_summary['end_reason'] ) {
		$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ] = [];
	}

	self::save_summary();
}
/**
 * Claim the lane by writing this process's lane hash into the `.pid` lane file.
 *
 * @since 6.1
 * @return void
 */
private function _take_over_lane() {
	$lane_file = $this->json_local_path() . '.pid';

	self::debug( 'Take over lane as lane is free: ' . $lane_file );
	File::save( $lane_file, LITESPEED_LANE_HASH );
}
/**
 * Refresh the lane file's modification time.
 *
 * _check_valid_lane() treats a lane file older than one hour as abandoned.
 *
 * @since 6.1
 * @return void
 */
private function _touch_lane() {
	$lane_file = $this->json_local_path() . '.pid';

	// phpcs:ignore WordPress.WP.AlternativeFunctions.file_system_operations_touch
	touch( $lane_file );
}
/**
 * Delete the lane file, if it exists.
 *
 * @since 6.1
 * @return void
 */
public function Release_lane() {
	$lane_file = $this->json_local_path() . '.pid';

	if ( file_exists( $lane_file ) ) {
		self::debug( 'Release lane' );
		// phpcs:ignore WordPress.WP.AlternativeFunctions.unlink_unlink
		unlink( $lane_file );
	}
}
/**
 * Check whether this process may use the lane.
 *
 * The lane is valid when the lane file is empty or unreadable, when it holds
 * this process's hash, or when it holds another hash but has not been touched
 * for more than an hour (the stale file is then removed).
 *
 * @since 6.1
 * @param bool $strict_mode When true, a missing lane file makes the lane invalid.
 * @return bool True if the lane is valid for this process.
 */
private function _check_valid_lane( $strict_mode = false ) {
	$lane_file = $this->json_local_path() . '.pid';

	if ( $strict_mode && ! file_exists( $lane_file ) ) {
		self::debug( 'lane file not existed, strict mode is false [file] ' . $lane_file );
		return false;
	}

	$owner          = File::read( $lane_file );
	$owned_by_other = $owner && LITESPEED_LANE_HASH !== $owner;
	if ( ! $owned_by_other ) {
		return true;
	}

	// Someone else holds the lane: honour it unless the file is older than 1h.
	$age = time() - filemtime( $lane_file );
	if ( $age <= 3600 ) {
		return false;
	}

	self::debug( 'Lane file is older than 1h, releasing lane' );
	$this->Release_lane();

	return true;
}
/**
 * Find out which local port (443, then 80) serves the site when the host is pinned to the server IP.
 *
 * Writes home_url() into a static test file, fetches it through cURL with the
 * host resolved to `$this->_server_ip`, and defines LITESPEED_CRAWLER_LOCAL_PORT
 * on success. The outcome is kept in the summary and reused for 120 seconds.
 *
 * A failed test clears any port remembered from an earlier success, so calls
 * inside the 120-second window report the failure and do not reuse a stale port.
 *
 * @since 7.0
 * @access private
 * @return bool true if success and can continue crawling, false otherwise.
 */
private function _test_port() {
	if ( empty( $this->_server_ip ) ) {
		// Without a server IP only non-role crawlers may proceed.
		if ( empty( $this->_crawlers[ $this->_summary['curr_crawler'] ]['uid'] ) ) {
			self::debug( 'Bypass test port as Server IP is not set' );
			return true;
		}
		self::debug( 'β Server IP not set' );
		return false;
	}

	if ( defined( 'LITESPEED_CRAWLER_LOCAL_PORT' ) ) {
		self::debug( 'β
LITESPEED_CRAWLER_LOCAL_PORT already defined' );
		return true;
	}

	// Don't repeat testing in 120s.
	if ( ! empty( $this->_summary['test_port_tts'] ) && ( time() - (int) $this->_summary['test_port_tts'] ) < 120 ) {
		if ( ! empty( $this->_summary['test_port'] ) ) {
			self::debug( 'β
Use tested local port: ' . $this->_summary['test_port'] );
			define( 'LITESPEED_CRAWLER_LOCAL_PORT', (int) $this->_summary['test_port'] );
			return true;
		}
		return false;
	}

	$this->_summary['test_port_tts'] = time();
	// Forget the previous result; it is set again below only if this test succeeds.
	$this->_summary['test_port'] = 0;
	self::save_summary();

	$options = $this->_get_curl_options();
	$home    = home_url();

	File::save( LITESPEED_STATIC_DIR . '/crawler/test_port.html', $home, true );
	$url        = LITESPEED_STATIC_URL . '/crawler/test_port.html';
	$parsed_url = wp_parse_url( $url );
	if ( empty( $parsed_url['host'] ) ) {
		self::debug( 'β Test port failed, invalid URL: ' . $url );
		return false;
	}

	$resolved                                = $parsed_url['host'] . ':443:' . $this->_server_ip;
	$options[ CURLOPT_RESOLVE ]              = [ $resolved ];
	$options[ CURLOPT_DNS_USE_GLOBAL_CACHE ] = false;
	$options[ CURLOPT_HEADER ]               = false;
	self::debug( 'Test local 443 port for ' . $resolved );

	// cURL is intentionally used for speed; suppress sniffs in this method.
	// phpcs:disable WordPress.WP.AlternativeFunctions
	$ch = curl_init();
	curl_setopt_array( $ch, $options );
	curl_setopt( $ch, CURLOPT_URL, $url );
	$result      = curl_exec( $ch );
	$test_result = false;

	if ( curl_errno( $ch ) || $result !== $home ) {
		if ( curl_errno( $ch ) ) {
			self::debug( 'β Test port curl error: [errNo] ' . curl_errno( $ch ) . ' [err] ' . curl_error( $ch ) );
		} elseif ( $result !== $home ) {
			self::debug( 'β Test port response is wrong: ' . $result );
		}
		self::debug( 'β Test local 443 port failed, try port 80' );

		// Close the 443 handle before it is replaced by the port-80 one.
		curl_close( $ch );

		// Try port 80.
		$resolved                   = $parsed_url['host'] . ':80:' . $this->_server_ip;
		$options[ CURLOPT_RESOLVE ] = [ $resolved ];
		$url                        = str_replace( 'https://', 'http://', $url );
		if ( empty( $options[ CURLOPT_HTTPHEADER ] ) || ! in_array( 'X-Forwarded-Proto: https', $options[ CURLOPT_HTTPHEADER ], true ) ) {
			$options[ CURLOPT_HTTPHEADER ][] = 'X-Forwarded-Proto: https';
		}

		$ch = curl_init();
		curl_setopt_array( $ch, $options );
		curl_setopt( $ch, CURLOPT_URL, $url );
		$result = curl_exec( $ch );

		if ( curl_errno( $ch ) ) {
			self::debug( 'β Test port curl error: [errNo] ' . curl_errno( $ch ) . ' [err] ' . curl_error( $ch ) );
		} elseif ( $result !== $home ) {
			self::debug( 'β Test port response is wrong: ' . $result );
		} else {
			self::debug( 'β
Test local 80 port successfully' );
			define( 'LITESPEED_CRAWLER_LOCAL_PORT', 80 );
			$this->_summary['test_port'] = 80;
			$test_result                 = true;
		}
	} else {
		self::debug( 'β
Tested local 443 port successfully' );
		define( 'LITESPEED_CRAWLER_LOCAL_PORT', 443 );
		$this->_summary['test_port'] = 443;
		$test_result                 = true;
	}

	self::save_summary();
	curl_close( $ch );
	// phpcs:enable

	return $test_result;
}
/**
* Run the crawl loop for the current crawler.
*
* After a successful _test_port(), pages through the sitemap in blocks of
* self::CHUNKS rows starting at the saved `last_pos`, requests the rows in
* batches of `_cur_threads` URLs, classifies every response and updates the
* counters in `$this->_summary`.
*
* Every exit path records why the loop stopped in `$this->_end_reason`:
* 'port_test_failed', 'lane_invalid', 'crawler_disabled', 'stopped_maxtime',
* 'stopped_reset', 'stopped_highload' or 'end'.
*
* @since 1.1.0
* @access private
* @return void
* @throws \Exception When lane becomes invalid during run.
*/
private function _do_running() {
// Options are built with the crawler-only flag, which prefixes the fast user agent.
$options = $this->_get_curl_options( true );
// If is role simulator and not defined local port, check port once.
$test_result = $this->_test_port();
if ( ! $test_result ) {
$this->_end_reason = 'port_test_failed';
self::debug( 'β Test port failed, crawler stopped.' );
return;
}
// Outer loop: fetch the next page of rows; `last_pos` advances inside the inner loop.
while ( true ) {
$url_chunks = $this->cls( 'Crawler_Map' )->list_map( self::CHUNKS, $this->_summary['last_pos'] );
if ( empty( $url_chunks ) ) {
break;
}
// One batch = as many URLs as there are worker threads at the time the page is split.
$url_chunks = array_chunk( $url_chunks, (int) $this->_cur_threads );
foreach ( $url_chunks as $rows ) {
// Strict check: the lane file must still exist and must not belong to another process.
if ( ! $this->_check_valid_lane( true ) ) {
$this->_end_reason = 'lane_invalid';
self::debug( 'π The crawler lane is used by newer crawler.' );
throw new \Exception( 'invalid crawler lane' );
}
// Update time.
$this->_touch_lane();
// multi curl.
$rets = $this->_multi_request( $rows, $options );
// check result headers.
foreach ( $rows as $row ) {
// Rows that were not requested (blacklisted / no-cache) have no entry in $rets.
if ( empty( $rets[ $row['id'] ] ) ) {
continue;
}
if ( 428 === (int) $rets[ $row['id'] ]['code'] ) {
// HTTP/1.1 428 Precondition Required (need to test)
$this->_end_reason = 'crawler_disabled';
self::debug( 'crawler_disabled' );
return;
}
$status = $this->_status_parse( $rets[ $row['id'] ]['header'], $rets[ $row['id'] ]['code'], $row['url'] ); // B or H or M or N(nocache).
self::debug( '[status] ' . $this->_status2title( $status ) . "\t\t [url] " . $row['url'] );
// Queue the result; it is written out by Crawler_Map::save_map_status() further down.
$this->_map_status_list[ $status ][ $row['id'] ] = [
'url' => $row['url'],
'code' => (int) $rets[ $row['id'] ]['code'], // 201 or 200 or 404.
];
// Per-crawler counter for this status, created on first use.
if ( empty( $this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ][ $status ] ) ) {
$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ][ $status ] = 0;
}
++$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ][ $status ];
}
// update offset position.
$_time = time();
$this->_summary['last_count'] = count( $rows );
$this->_summary['last_pos'] += $this->_summary['last_count'];
$this->_summary['last_crawled'] += $this->_summary['last_count'];
$this->_summary['last_update_time'] = $_time;
$this->_summary['last_status'] = 'updated position';
// check duration.
if ( $this->_summary['last_update_time'] > $this->_max_run_time ) {
$this->_end_reason = 'stopped_maxtime';
self::debug( 'Terminated due to maxtime' );
return;
}
// make sure at least each 10s save meta & map status once.
if ( $_time - $this->_summary['meta_save_time'] > 10 ) {
$this->_map_status_list = $this->cls( 'Crawler_Map' )->save_map_status( $this->_map_status_list, $this->_summary['curr_crawler'] );
self::save_summary();
}
// check if need to reset pos each 5s.
if ( $_time > $this->_summary['pos_reset_check'] ) {
$this->_summary['pos_reset_check'] = $_time + 5;
// The reset file acts as a one-shot signal: it is deleted as soon as it is seen.
if ( file_exists( $this->_resetfile ) && unlink( $this->_resetfile ) ) { // phpcs:ignore WordPress.WP.AlternativeFunctions.unlink_unlink
self::debug( 'Terminated due to reset file' );
$this->_summary['last_pos'] = 0;
$this->_summary['curr_crawler'] = 0;
$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ] = [];
// reset done status.
$this->_summary['done'] = 0;
$this->_summary['this_full_beginning_time'] = 0;
$this->_end_reason = 'stopped_reset';
return;
}
}
// check loads: re-evaluate the thread count when the last adjustment is more than 60s old.
if ( ( $this->_summary['last_update_time'] - $this->_cur_thread_time ) > 60 ) {
$this->_adjust_current_threads();
if ( 0 === (int) $this->_cur_threads ) {
$this->_end_reason = 'stopped_highload';
self::debug( 'π Terminated due to highload' );
return;
}
}
// NOTE(review): the status text says "ms" but usleep() takes microseconds — confirm the intended unit.
$this->_summary['last_status'] = 'sleeping ' . (int) $this->_crawler_conf['run_delay'] . 'ms';
usleep( (int) $this->_crawler_conf['run_delay'] );
}
}
// All URLs are done for current crawler.
$this->_end_reason = 'end';
$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ]['W'] = 0;
self::debug( 'Crawler #' . $this->_summary['curr_crawler'] . ' touched end' );
}
/**
 * Decide whether requests should pin the host to the configured server IP.
 *
 * True when a server IP is configured or when the `litespeed_hash` cookie is set.
 *
 * @since 7.3.0.1
 * @return bool
 */
private function _should_force_resolve_dns() {
	$has_server_ip = ! empty( $this->_server_ip );
	$has_role_hash = ! empty( $this->_crawler_conf['cookies'] ) && ! empty( $this->_crawler_conf['cookies']['litespeed_hash'] );

	return $has_server_ip || $has_role_hash;
}
/**
 * Request a batch of URLs in parallel with curl_multi.
 *
 * Rows whose result for the current crawler is already B (blacklist) or
 * N (no-cache) are not requested and get no entry in the returned array.
 *
 * @since 1.1.0
 * @access private
 *
 * @param array<int,array<string,mixed>> $rows    Rows to crawl; `id`, `url` and `res` are read.
 * @param array                          $options cURL options applied to every handle.
 * @return array<int,array{header:string,code:int}> Response header block and HTTP code, keyed by row id.
 */
private function _multi_request( $rows, $options ) {
	if ( ! function_exists( 'curl_multi_init' ) ) {
		exit( 'curl_multi_init disabled' );
	}

	// phpcs:disable WordPress.WP.AlternativeFunctions
	$mh                  = curl_multi_init();
	$crawler_drop_domain = defined( 'LITESPEED_CRAWLER_DROP_DOMAIN' ) ? (bool) constant( 'LITESPEED_CRAWLER_DROP_DOMAIN' ) : false;
	$crawler_idx         = $this->_summary['curr_crawler'];
	$handles             = [];

	foreach ( $rows as $row ) {
		// Result char previously stored for the current crawler.
		$stored_res = substr( $row['res'], $crawler_idx, 1 );
		if ( self::STATUS_BLACKLIST === $stored_res || self::STATUS_NOCACHE === $stored_res ) {
			continue;
		}

		if ( ! function_exists( 'curl_init' ) ) {
			exit( 'curl_init disabled' );
		}

		$row_id             = $row['id'];
		$handles[ $row_id ] = curl_init();

		// With the drop-domain constant, stored URLs get the site base prepended.
		$url = $crawler_drop_domain ? $this->_crawler_conf['base'] . $row['url'] : $row['url'];

		// IP resolve.
		if ( $this->_should_force_resolve_dns() ) {
			$parsed_url = wp_parse_url( $url );
			if ( ! empty( $parsed_url['host'] ) ) {
				$port     = defined( 'LITESPEED_CRAWLER_LOCAL_PORT' ) ? (int) LITESPEED_CRAWLER_LOCAL_PORT : 443;
				$resolved = $parsed_url['host'] . ':' . $port . ':' . $this->_server_ip;

				$options[ CURLOPT_RESOLVE ]              = [ $resolved ];
				$options[ CURLOPT_DNS_USE_GLOBAL_CACHE ] = false;

				if ( 80 === $port ) {
					// Plain HTTP on the local port, flagged as HTTPS for the backend.
					$url            = str_replace( 'https://', 'http://', $url );
					$has_fwd_header = ! empty( $options[ CURLOPT_HTTPHEADER ] ) && in_array( 'X-Forwarded-Proto: https', $options[ CURLOPT_HTTPHEADER ], true );
					if ( ! $has_fwd_header ) {
						$options[ CURLOPT_HTTPHEADER ][] = 'X-Forwarded-Proto: https';
					}
				}
				self::debug( 'Resolved DNS for ' . $resolved );
			}
		}

		curl_setopt( $handles[ $row_id ], CURLOPT_URL, $url );
		self::debug( 'Crawling [url] ' . $url . ( $url === $row['url'] ? '' : ' [ori] ' . $row['url'] ) );
		curl_setopt_array( $handles[ $row_id ], $options );
		curl_multi_add_handle( $mh, $handles[ $row_id ] );
	}

	// Execute curl.
	if ( $handles ) {
		do {
			$status = curl_multi_exec( $mh, $active );
			if ( $active ) {
				curl_multi_select( $mh );
			}
		} while ( $active && CURLM_OK === $status );
	}

	// Curl done: collect header and code for every row that was requested.
	$ret = [];
	foreach ( $rows as $row ) {
		$stored_res = substr( $row['res'], $crawler_idx, 1 );
		if ( self::STATUS_BLACKLIST === $stored_res || self::STATUS_NOCACHE === $stored_res ) {
			continue;
		}

		$ch = $handles[ $row['id'] ];

		// Keep only the header part of the response.
		$header_size = curl_getinfo( $ch, CURLINFO_HEADER_SIZE );
		$content     = curl_multi_getcontent( $ch );

		$ret[ $row['id'] ] = [
			'header' => substr( $content, 0, $header_size ),
			'code'   => (int) curl_getinfo( $ch, CURLINFO_HTTP_CODE ),
		];

		curl_multi_remove_handle( $mh, $ch );
		curl_close( $ch );
	}
	curl_multi_close( $mh );
	// phpcs:enable

	return $ret;
}
/**
 * Map a status character to the label used in the debug log.
 *
 * Blacklist and no-cache share the same label; any other value is reported as unknown.
 *
 * @since 6.0
 * @param string $status Status char.
 * @return string Human title.
 */
private function _status2title( $status ) {
	$titles = [
		self::STATUS_HIT       => 'β
Hit',
		self::STATUS_MISS      => 'π Miss',
		self::STATUS_BLACKLIST => 'π
Blacklisted',
		self::STATUS_NOCACHE   => 'π
Blacklisted',
	];

	// Only exact string matches count, mirroring a strict comparison against each constant.
	if ( is_string( $status ) && isset( $titles[ $status ] ) ) {
		return $titles[ $status ];
	}

	return 'πΈ Unknown';
}
/**
 * Classify a crawl response as hit, miss, no-cache or blacklist.
 *
 * Order of checks: HTTP 201 is a hit; an explicit
 * "X-Litespeed-Cache-Control: no-cache" is no-cache (or a miss when ignoring
 * or blocklisting is switched off); a known cache header is a miss only when
 * it says "miss" (and not "bkn"), otherwise a hit; no cache header at all is
 * a blacklist (or a miss when the blocklist is switched off).
 *
 * @since 2.0
 * @access private
 *
 * @param string $header Response header.
 * @param int    $code   HTTP code.
 * @param string $url    URL, passed to the `litespeed_crawler_disable_blocklist` filter.
 * @return string One of status chars.
 */
private function _status_parse( $header, $code, $url ) {
	if ( 201 === (int) $code ) {
		return self::STATUS_HIT;
	}

	// Evaluated lazily so the filter only runs on the paths that need it.
	$blocklist_disabled = static function () use ( $url ) {
		return ( defined( 'LITESPEED_CRAWLER_DISABLE_BLOCKLIST' ) && constant( 'LITESPEED_CRAWLER_DISABLE_BLOCKLIST' ) ) || apply_filters( 'litespeed_crawler_disable_blocklist', false, $url );
	};

	if ( false !== stripos( $header, 'X-Litespeed-Cache-Control: no-cache' ) ) {
		// Non-cacheable responses may be configured to count as a plain miss.
		if ( defined( 'LITESPEED_CRAWLER_IGNORE_NONCACHEABLE' ) && constant( 'LITESPEED_CRAWLER_IGNORE_NONCACHEABLE' ) ) {
			return self::STATUS_MISS;
		}

		return $blocklist_disabled() ? self::STATUS_MISS : self::STATUS_NOCACHE;
	}

	foreach ( [ 'x-litespeed-cache', 'x-qc-cache', 'x-lsadc-cache' ] as $cache_header ) {
		if ( false === stripos( $header, $cache_header ) ) {
			continue;
		}

		// The first cache header found decides: "bkn" wins over "miss", anything else is a hit.
		$is_bkn  = false !== stripos( $header, $cache_header . ': bkn' );
		$is_miss = false !== stripos( $header, $cache_header . ': miss' );

		return ( ! $is_bkn && $is_miss ) ? self::STATUS_MISS : self::STATUS_HIT;
	}

	// No cache header at all.
	return $blocklist_disabled() ? self::STATUS_MISS : self::STATUS_BLACKLIST;
}
/**
 * Build the cURL option set shared by all crawler requests.
 *
 * Side effect: with `$crawler_only`, the fast user-agent token is prefixed
 * to `$this->_crawler_conf['ua']` if it is not already there.
 *
 * @since 1.1.0
 * @access private
 *
 * @param bool $crawler_only Whether to prefix the crawler-only (fast) user agent.
 * @return array cURL options keyed by CURLOPT_* constants.
 */
private function _get_curl_options( $crawler_only = false ) {
	$crawler_timeout = defined( 'LITESPEED_CRAWLER_TIMEOUT' ) ? (int) constant( 'LITESPEED_CRAWLER_TIMEOUT' ) : 30;

	$http_headers   = $this->_crawler_conf['headers'];
	$http_headers[] = 'Cache-Control: max-age=0';

	$options = [
		CURLOPT_RETURNTRANSFER => true,
		CURLOPT_HEADER         => true,
		CURLOPT_CUSTOMREQUEST  => 'GET',
		CURLOPT_FOLLOWLOCATION => false,
		CURLOPT_ENCODING       => 'gzip',
		CURLOPT_CONNECTTIMEOUT => 10,
		CURLOPT_TIMEOUT        => $crawler_timeout, // Larger timeout to avoid incorrect blacklist addition #900171.
		CURLOPT_SSL_VERIFYHOST => 0,
		CURLOPT_SSL_VERIFYPEER => false,
		CURLOPT_NOBODY         => false,
		CURLOPT_HTTPHEADER     => $http_headers,
		CURLOPT_HTTP_VERSION   => CURL_HTTP_VERSION_1_1,
	];

	// Referer: the request that triggered this crawl, when there is one.
	if ( isset( $_SERVER['HTTP_HOST'], $_SERVER['REQUEST_URI'] ) ) {
		$host = sanitize_text_field( wp_unslash( $_SERVER['HTTP_HOST'] ) );
		$uri  = sanitize_text_field( wp_unslash( $_SERVER['REQUEST_URI'] ) );

		$options[ CURLOPT_REFERER ] = 'http://' . $host . $uri;
	}

	// User agent.
	$ua = (string) $this->_crawler_conf['ua'];
	if ( $crawler_only && 0 !== strpos( $ua, self::FAST_USER_AGENT ) ) {
		$this->_crawler_conf['ua'] = self::FAST_USER_AGENT . ' ' . $ua;
	}
	$options[ CURLOPT_USERAGENT ] = (string) $this->_crawler_conf['ua'];

	// Cookies: empty values are left out.
	$pairs = [];
	foreach ( $this->_crawler_conf['cookies'] as $name => $value ) {
		if ( $value ) {
			$pairs[] = $name . '=' . rawurlencode( $value );
		}
	}
	if ( $pairs ) {
		$options[ CURLOPT_COOKIE ] = implode( '; ', $pairs );
	}

	return $options;
}
/**
 * Fetch a URL from this site with cURL and return the response body.
 *
 * When `$uid` is given, the request carries a `litespeed_flash_hash` cookie
 * obtained from Router::get_flash_hash() and the host is pinned to the
 * configured server IP. Redirects are followed.
 *
 * @since 3.3
 *
 * @param string       $url    URL.
 * @param string       $ua     User agent.
 * @param int|false    $uid    Optional user ID for simulation.
 * @param string|false $accept Optional Accept header value.
 * @return string|false Body on HTTP 200, false on any other response code.
 */
public function self_curl( $url, $ua, $uid = false, $accept = false ) {
	$this->_crawler_conf['base'] = site_url();
	$this->_crawler_conf['ua']   = $ua;
	if ( $accept ) {
		$this->_crawler_conf['headers'] = [ 'Accept: ' . $accept ];
	}

	// The cookie must be in place BEFORE the options are built: _get_curl_options() turns the
	// cookie list into CURLOPT_COOKIE at call time. Without a uid, drop any hash left over
	// from an earlier call on this instance so it is not sent with an unrelated request.
	if ( $uid ) {
		$this->_crawler_conf['cookies']['litespeed_flash_hash'] = Router::cls()->get_flash_hash( $uid );
	} else {
		unset( $this->_crawler_conf['cookies']['litespeed_flash_hash'] );
	}

	$options = $this->_get_curl_options();

	if ( $uid ) {
		$parsed_url = wp_parse_url( $url );
		if ( ! empty( $parsed_url['host'] ) ) {
			$dom      = $parsed_url['host'];
			$port     = defined( 'LITESPEED_CRAWLER_LOCAL_PORT' ) ? (int) LITESPEED_CRAWLER_LOCAL_PORT : 443;
			$resolved = $dom . ':' . $port . ':' . $this->_server_ip;

			$options[ CURLOPT_RESOLVE ]              = [ $resolved ];
			$options[ CURLOPT_DNS_USE_GLOBAL_CACHE ] = false;
			$options[ CURLOPT_PORT ]                 = $port;
			self::debug( 'Resolved DNS for ' . $resolved );
		}
	}

	// Body only, and follow redirects.
	$options[ CURLOPT_HEADER ]         = false;
	$options[ CURLOPT_FOLLOWLOCATION ] = true;

	// phpcs:disable WordPress.WP.AlternativeFunctions
	$ch = curl_init();
	curl_setopt_array( $ch, $options );
	curl_setopt( $ch, CURLOPT_URL, $url );
	$result = curl_exec( $ch );
	$code   = (int) curl_getinfo( $ch, CURLINFO_HTTP_CODE );
	curl_close( $ch );
	// phpcs:enable

	if ( 200 !== $code ) {
		self::debug( 'β Response code is not 200 in self_curl() [code] ' . $code );
		return false;
	}

	return $result;
}
/**
 * Terminate crawling.
 *
 * Flushes the batched map statuses, advances to the next crawler when the
 * current one reached its end (wrapping to the first crawler after the last),
 * and persists the summary with the run marked as stopped.
 *
 * @since 1.1.0
 * @access private
 * @return void
 */
private function _terminate_running() {
	// Persist the pending per-URL statuses for the crawler that was just running.
	$crawler_map            = $this->cls( 'Crawler_Map' );
	$this->_map_status_list = $crawler_map->save_map_status( $this->_map_status_list, $this->_summary['curr_crawler'] );

	if ( 'end' === $this->_end_reason ) {
		// Current crawler is finished: move to the next one and restart from position 0.
		$next_crawler                              = (int) $this->_summary['curr_crawler'] + 1;
		$this->_summary['curr_crawler']            = $next_crawler;
		$this->_summary['last_pos']                = 0;
		$this->_summary['last_crawler_total_cost'] = time() - (int) $this->_summary['curr_crawler_beginning_time'];

		if ( $next_crawler >= count( $this->list_crawlers() ) ) {
			// Every crawler has run: wrap around and record the duration of the full pass.
			self::debug( '_terminate_running Touched end, whole crawled. Reload crawler!' );
			$this->_summary['curr_crawler']        = 0;
			$this->_summary['done']                = 'touchedEnd';
			$this->_summary['last_full_time_cost'] = time() - (int) $this->_summary['this_full_beginning_time'];
		}
	}

	// Mark the run as stopped and remember why it ended.
	$this->_summary['last_status'] = 'stopped';
	$this->_summary['is_running']  = 0;
	$this->_summary['end_reason']  = $this->_end_reason;
	self::save_summary();
}
/**
 * List all crawlers.
 *
 * Collects the enabled crawler factors ( factor => [ value => title, ... ] ) —
 * user (Guest plus configured role users), WebP, Guest Mode vary cookie, mobile
 * and custom cookies — and crosses them into the flat list of crawler variants.
 * The result is cached on the instance after the first non-empty build.
 *
 * @since 1.9.1
 * @access public
 * @return array<int,array<string,mixed>>
 */
public function list_crawlers() {
	if ( $this->_crawlers ) {
		return $this->_crawlers;
	}

	$crawler_factors = [];

	// Add default Guest crawler.
	$crawler_factors['uid'] = [ 0 => __( 'Guest', 'litespeed-cache' ) ];

	// WebP on/off.
	if ( $this->conf( Base::O_IMG_OPTM_WEBP ) ) {
		$crawler_factors['webp'] = [ 1 => $this->cls( 'Media' )->next_gen_image_title() ];
		// The non-WebP variant is only crawled when the filter opts in.
		if ( apply_filters( 'litespeed_crawler_webp', false ) ) {
			$crawler_factors['webp'][0] = '';
		}
	}

	// Guest Mode on/off.
	if ( $this->conf( Base::O_GUEST ) ) {
		$vary_name = $this->cls( 'Vary' )->get_vary_name();
		$vary_val  = 'guest_mode:1';
		// The vary value is hashed unless LSCWP_LOG is defined.
		if ( ! defined( 'LSCWP_LOG' ) ) {
			$vary_val = md5( $this->conf( Base::HASH ) . $vary_val );
		}
		$crawler_factors[ 'cookie:' . $vary_name ] = [
			$vary_val => '',
			'_null'   => '<font data-balloon-pos="up" aria-label="Guest Mode">π</font>',
		];
	}

	// Mobile crawler.
	if ( $this->conf( Base::O_CACHE_MOBILE ) ) {
		$crawler_factors['mobile'] = [
			1 => '<font data-balloon-pos="up" aria-label="Mobile">π±</font>',
			0 => '',
		];
	}

	// Get roles set: each configured user is labelled with its first role.
	foreach ( $this->conf( Base::O_CRAWLER_ROLES ) as $v ) {
		$role_title = '';
		$udata      = get_userdata( $v );
		if ( isset( $udata->roles ) && is_array( $udata->roles ) ) {
			$tmp        = array_values( $udata->roles );
			$role_title = array_shift( $tmp );
		}
		if ( ! $role_title ) {
			continue;
		}
		$crawler_factors['uid'][ $v ] = ucfirst( $role_title );
	}

	// Cookie crawler.
	foreach ( $this->conf( Base::O_CRAWLER_COOKIES ) as $v ) {
		// Skip entries without a name or without any value: an empty factor would
		// make the cross product below collapse to zero crawlers.
		if ( empty( $v['name'] ) || empty( $v['vals'] ) || ! is_array( $v['vals'] ) ) {
			continue;
		}
		$this_cookie_key                     = 'cookie:' . $v['name'];
		$crawler_factors[ $this_cookie_key ] = [];
		foreach ( $v['vals'] as $v2 ) {
			$crawler_factors[ $this_cookie_key ][ $v2 ] =
				( '_null' === $v2 ? '' : '<font data-balloon-pos="up" aria-label="Cookie">πͺ</font>' . esc_html( $v['name'] ) . '=' . esc_html( $v2 ) );
		}
	}

	// Crossing generate the crawler list.
	$this->_crawlers = $this->_recursive_build_crawler( $crawler_factors );

	return $this->_crawlers;
}
/**
 * Build a crawler list recursively.
 *
 * Walks the factors one level per call: for every value of the factor at index
 * $i it extends the partial crawler in $group (title and factor value) and
 * either emits it (last factor) or descends into the next factor.
 *
 * @since 2.8
 * @access private
 *
 * @param array<string,array> $crawler_factors Factors ( factor => [ value => title ] ).
 * @param array               $group           Partial crawler built by the outer levels.
 * @param int                 $i               Index of the factor handled by this call.
 * @return array<int,array>
 */
private function _recursive_build_crawler( $crawler_factors, $group = [], $i = 0 ) {
	$factor_names = array_keys( $crawler_factors );
	$factor_name  = $factor_names[ $i ];
	$is_last      = count( $crawler_factors ) <= ( $i + 1 );

	// Title inherited from the outer levels; identical for every value of this factor.
	$inherited_title = ! empty( $group['title'] ) ? $group['title'] : '';

	$variants = [];
	foreach ( $crawler_factors[ $factor_name ] as $factor_value => $label ) {
		// Work on a copy so sibling iterations all start from the same $group.
		$variant          = $group;
		$variant['title'] = $inherited_title;

		// Non-empty labels are appended, separated by ' - ' from any earlier part.
		if ( $label ) {
			$separator         = $variant['title'] ? ' - ' : '';
			$variant['title'] .= $separator . $label;
		}

		$variant[ $factor_name ] = $factor_value;

		if ( $is_last ) {
			$variants[] = $variant;
			continue;
		}

		// Inception: next layer.
		foreach ( $this->_recursive_build_crawler( $crawler_factors, $variant, $i + 1 ) as $nested ) {
			$variants[] = $nested;
		}
	}

	return $variants;
}
/**
 * Return crawler meta file local path.
 *
 * The path is LITESPEED_STATIC_DIR, the `crawler` sub-directory and the
 * site meta file name; existence of the file is not checked.
 *
 * @since 6.1
 * @access public
 * @return string
 */
public function json_local_path() {
	$crawler_dir = LITESPEED_STATIC_DIR . '/crawler/';

	return $crawler_dir . $this->_sitemeta;
}
/**
 * Return crawler meta file URL.
 *
 * @since 1.1.0
 * @access public
 * @return string|false URL of the meta file, or false when the file does not exist on disk.
 */
public function json_path() {
	// Same relative location under both the static directory and the static URL.
	$relative = '/crawler/' . $this->_sitemeta;

	if ( file_exists( LITESPEED_STATIC_DIR . $relative ) ) {
		return LITESPEED_STATIC_URL . $relative;
	}

	return false;
}
/**
 * Create reset pos file.
 *
 * Writes the current timestamp into the reset file and saves the summary
 * with the running flag cleared.
 *
 * @since 1.1.0
 * @access public
 * @return void
 */
public function reset_pos() {
	$not_running = [ 'is_running' => 0 ];

	File::save( $this->_resetfile, time(), true );
	self::save_summary( $not_running );
}
/**
 * Display status based by matching crawlers order.
 *
 * Renders one numbered dot per character of the status string; the character
 * selects the dot colour and the reason at the same position (if any) becomes
 * the dot's tooltip.
 *
 * @since 3.0
 * @access public
 *
 * @param string $status_row Status string, one character per crawler.
 * @param string $reason_set Comma separated reasons, positionally matching the status characters.
 * @return string HTML dots, or an empty string when there is no status.
 */
public function display_status( $status_row, $reason_set ) {
	if ( ! $status_row ) {
		return '';
	}

	// Status character => CSS colour suffix.
	$_status_list = [
		'-'                    => 'default',
		self::STATUS_MISS      => 'primary',
		self::STATUS_HIT       => 'success',
		self::STATUS_BLACKLIST => 'danger',
		self::STATUS_NOCACHE   => 'warning',
	];

	// Cast so that a null reason set does not reach explode().
	$reason_set = explode( ',', (string) $reason_set );

	$status = '';
	foreach ( str_split( $status_row ) as $k => $v ) {
		$reason = isset( $reason_set[ $k ] ) ? $reason_set[ $k ] : '';
		// Expand the short reason codes into readable text.
		if ( 'Man' === $reason ) {
			$reason = __( 'Manually added to blocklist', 'litespeed-cache' );
		}
		if ( 'Existed' === $reason ) {
			$reason = __( 'Previously existed in blocklist', 'litespeed-cache' );
		}
		$reason_attr = $reason ? 'data-balloon-pos="up" aria-label="' . esc_attr( $reason ) . '"' : '';

		// Status characters without an entry in the map (STATUS_WAIT has none) use the
		// neutral style instead of raising an undefined-index notice.
		$css_suffix = isset( $_status_list[ $v ] ) ? $_status_list[ $v ] : 'default';

		$status .= '<i class="litespeed-dot litespeed-bg-' . esc_attr( $css_suffix ) . '" ' . $reason_attr . '>' . ( $k + 1 ) . '</i>';
	}

	return $status;
}
/**
 * Handle all request actions from main cls.
 *
 * Dispatches on the verified request type to the matching crawler or
 * crawler-map operation, then redirects back through Admin::redirect().
 *
 * @since 3.0
 * @access public
 * @return void
 */
public function handler() {
	$action = Router::verify_type();

	switch ( $action ) {
		case self::TYPE_REFRESH_MAP:
			$this->cls( 'Crawler_Map' )->gen( true );
			break;

		case self::TYPE_EMPTY:
			$this->cls( 'Crawler_Map' )->empty_map();
			break;

		case self::TYPE_BLACKLIST_EMPTY:
			$this->cls( 'Crawler_Map' )->blacklist_empty();
			break;

		case self::TYPE_BLACKLIST_DEL:
			// Nothing to do without a row ID in the query string.
			// phpcs:ignore WordPress.Security.NonceVerification.Recommended, WordPress.Security.ValidatedSanitizedInput.MissingUnslash, WordPress.Security.ValidatedSanitizedInput.InputNotSanitized
			if ( empty( $_GET['id'] ) ) {
				break;
			}
			// phpcs:ignore WordPress.Security.NonceVerification.Recommended
			$row_id = absint( wp_unslash( $_GET['id'] ) );
			$this->cls( 'Crawler_Map' )->blacklist_del( $row_id );
			break;

		case self::TYPE_BLACKLIST_ADD:
			// Nothing to do without a row ID in the query string.
			// phpcs:ignore WordPress.Security.NonceVerification.Recommended, WordPress.Security.ValidatedSanitizedInput.MissingUnslash, WordPress.Security.ValidatedSanitizedInput.InputNotSanitized
			if ( empty( $_GET['id'] ) ) {
				break;
			}
			// phpcs:ignore WordPress.Security.NonceVerification.Recommended
			$row_id = absint( wp_unslash( $_GET['id'] ) );
			$this->cls( 'Crawler_Map' )->blacklist_add( $row_id );
			break;

		case self::TYPE_START:
			// Handle the ajax request to proceed crawler manually by admin.
			self::start_async();
			break;

		case self::TYPE_RESET:
			$this->reset_pos();
			break;

		default:
			break;
	}

	Admin::redirect();
}
}