http.php
15.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
<?php
/*
Plugin Name: Basic HTTP
Description: Check all links that have the HTTP/HTTPS protocol.
Version: 1.0
Author: Janis Elsts
ModuleID: http
ModuleCategory: checker
ModuleContext: on-demand
ModuleLazyInit: true
ModuleClassName: blcHttpChecker
ModulePriority: -1
*/
require_once BLC_DIRECTORY_LEGACY . '/includes/token-bucket.php';
// TODO: Rewrite sub-classes as transports, not stand-alone checkers
/**
 * Entry-point checker for HTTP(S) links.
 *
 * Does not perform requests itself: init() picks a transport-specific checker
 * (cURL when available, otherwise the WordPress HTTP API) and every call is
 * forwarded to it. Outgoing checks are throttled per host name through a
 * token bucket list.
 */
class blcHttpChecker extends blcChecker {

	/** @var blcChecker Transport-specific checker that does the actual work. */
	public $implementation = null;

	/** @var blcTokenBucketList Rate limiter keyed by host name. */
	private $token_bucket_list;

	/**
	 * Set up the rate limiter and choose the transport implementation.
	 *
	 * @return void
	 */
	function init() {
		parent::init();

		// Throttling parameters come from the plugin configuration, with built-in defaults.
		$settings     = blc_get_configuration();
		$rate         = $settings->get( 'http_throttle_rate', 3 );
		$period       = $settings->get( 'http_throttle_period', 15 );
		$min_interval = $settings->get( 'http_throttle_min_interval', 2 );

		$this->token_bucket_list = new blcTokenBucketList( $rate, $period, $min_interval );

		// Prefer cURL; fall back to the WordPress request API when it is unavailable.
		$curl_available  = function_exists( 'curl_init' ) || is_callable( 'curl_init' );
		$transport_class = $curl_available ? 'blcCurlHttp' : 'blcWPHttp';

		$this->implementation = new $transport_class(
			$this->module_id,
			$this->cached_header,
			$this->plugin_conf,
			$this->module_manager
		);
	}

	/**
	 * Ask the selected transport whether it can check the given URL.
	 *
	 * @param string     $url
	 * @param array|bool $parsed Parsed URL components, passed through to the transport.
	 * @return bool False when no transport has been set up yet.
	 */
	function can_check( $url, $parsed ) {
		if ( ! isset( $this->implementation ) ) {
			return false;
		}

		return $this->implementation->can_check( $url, $parsed );
	}

	/**
	 * Check a URL through the selected transport, throttled per host.
	 *
	 * @param string $url
	 * @param bool   $use_get Forwarded unchanged to the transport.
	 * @return mixed Whatever the transport's check() returns.
	 */
	function check( $url, $use_get = false ) {
		global $blclog;

		// Take a token from the bucket of the URL's host before sending anything.
		$host = @parse_url( $url, PHP_URL_HOST );
		if ( $host ) {
			$this->token_bucket_list->takeToken( $host );
		}

		$blclog->debug( 'HTTP module checking "' . $url . '"' );

		return $this->implementation->check( $url, $use_get );
	}
}
/**
* Base class for checkers that deal with HTTP(S) URLs.
*
* @package Broken Link Checker
* @access public
*/
class blcHttpCheckerBase extends blcChecker {

	/**
	 * Normalise a URL before it is requested.
	 *
	 * Decodes HTML entities, strips a trailing PHPSESSID/sid query argument and
	 * a trailing fragment, removes Left-to-Right marks and trims whitespace.
	 *
	 * @param string $url
	 * @return string
	 */
	function clean_url( $url ) {
		$url = html_entity_decode( $url );

		// U+200E LEFT-TO-RIGHT MARK, quoted for use inside a "/"-delimited pattern.
		$ltrm = preg_quote( json_decode( '"\u200E"' ), '/' );

		$url = preg_replace(
			array(
				'/([\?&]PHPSESSID=\w+)$/i', // Remove session ID.
				'/(#[^\/]*)$/', // Remove anchors/fragments.
				// Convert improper (e.g. double-encoded) HTML entities. The pattern must be
				// the entity itself: matching a bare "&" and replacing it with "&" does nothing.
				'/&amp;/',
				'/([\?&]sid=\w+)$/i', // Remove another flavour of session ID.
				'/' . $ltrm . '/', // Remove Left-to-Right marks that can show up when copying from Word.
			),
			array( '', '', '&', '', '' ),
			$url
		);

		return trim( $url );
	}

	/**
	 * Decide whether an HTTP status code counts as an error.
	 *
	 * "Good" response codes are anything in the 2XX range (e.g. "200 OK") and
	 * redirects (3XX). HTTP 401 Unauthorized is a special case that is considered
	 * OK as well. Everything else is treated as an error.
	 *
	 * @param int|string $http_code
	 * @return bool True when the code indicates an error.
	 */
	function is_error_code( $http_code ) {
		// Normalise first so a numeric string such as "401" is not misclassified
		// by the strict comparison below.
		$http_code = (int) $http_code;

		$good_code = ( ( $http_code >= 200 ) && ( $http_code < 400 ) ) || ( 401 === $http_code );

		return ! $good_code;
	}

	/**
	 * This checker only accepts HTTP(s) links.
	 *
	 * @param string     $url
	 * @param array|bool $parsed
	 * @return bool
	 */
	function can_check( $url, $parsed ) {
		if ( ! isset( $parsed['scheme'] ) ) {
			return false;
		}

		return in_array( strtolower( $parsed['scheme'] ), array( 'http', 'https' ), true );
	}

	/**
	 * Takes an URL and replaces spaces and some other non-alphanumeric characters with their urlencoded equivalents.
	 *
	 * @param string $url
	 * @return string
	 */
	function urlencodefix( $url ) {
		// TODO: Remove/fix this. Probably not a good idea to "fix" invalid URLs like that.
		return preg_replace_callback(
			'|[^a-z0-9\+\-\/\\#:.,;=?!&%@()$\|*~_]|i',
			function( $match ) {
				// $match[0] is the single character outside the allowed set.
				return rawurlencode( $match[0] );
			},
			$url
		);
	}
}
/**
 * HTTP(S) link checker that sends its requests through the PHP cURL extension.
 */
class blcCurlHttp extends blcHttpCheckerBase {

	/** @var string Raw response headers collected by read_header() for the current request. */
	var $last_headers = '';

	/**
	 * Check a single URL with cURL.
	 *
	 * Sends a HEAD request by default. When that attempt reports the link as
	 * broken, or reports exactly one redirect, the check is repeated once with GET
	 * unless it timed out or the 'blc_use_get_checker' filter returns a truthy value.
	 *
	 * @param string $url
	 * @param bool   $use_get Send a GET request instead of HEAD.
	 * @return array Result with the keys 'broken', 'timeout', 'warning', 'http_code',
	 *               'final_url', 'request_duration', 'redirect_count', 'log' and
	 *               'result_hash'; 'status_code', 'status_text' and 'error_code' are
	 *               added for some failures.
	 */
	function check( $url, $use_get = false ) {
		global $blclog;
		$blclog->info( __CLASS__ . ' Checking link', $url );
		$this->last_headers = '';

		$url = $this->clean_url( $url );
		$blclog->debug( __CLASS__ . ' Clean URL:', $url );

		$result = array(
			'broken'  => false,
			'timeout' => false,
			'warning' => false,
		);
		$log    = '';

		// Get the BLC configuration. It's used below to set the right timeout values and such.
		$conf = blc_get_configuration();

		// Init curl.
		$ch              = curl_init();
		$request_headers = array();
		curl_setopt( $ch, CURLOPT_URL, $this->urlencodefix( $url ) );
		curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );

		// Masquerade as a recent version of Chrome.
		$ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.34 Safari/537.36';
		curl_setopt( $ch, CURLOPT_USERAGENT, $ua );

		// Close the connection after the request (disables keep-alive). The plugin rate-limits requests,
		// so it's likely we'd overrun the keep-alive timeout anyway.
		curl_setopt( $ch, CURLOPT_FORBID_REUSE, true );
		$request_headers[] = 'Connection: close';

		// Add a semi-plausible referer header to avoid tripping up some bot traps.
		curl_setopt( $ch, CURLOPT_REFERER, home_url() );

		// Redirects don't work when safe mode or open_basedir is enabled.
		if ( ! blcUtility::is_safe_mode() && ! blcUtility::is_open_basedir() ) {
			curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
		} else {
			$log .= "[Warning] Could't follow the redirect URL (if any) because safemode or open base dir enabled\n";
		}

		// Set maximum redirects.
		curl_setopt( $ch, CURLOPT_MAXREDIRS, 10 );

		// Set the timeout.
		curl_setopt( $ch, CURLOPT_TIMEOUT, $conf->options['timeout'] );
		curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $conf->options['timeout'] );

		// Set the proxy configuration. The user can provide this in wp-config.php.
		if ( defined( 'WP_PROXY_HOST' ) ) {
			curl_setopt( $ch, CURLOPT_PROXY, WP_PROXY_HOST );
		}
		if ( defined( 'WP_PROXY_PORT' ) ) {
			curl_setopt( $ch, CURLOPT_PROXYPORT, WP_PROXY_PORT );
		}
		if ( defined( 'WP_PROXY_USERNAME' ) ) {
			$auth = WP_PROXY_USERNAME;
			if ( defined( 'WP_PROXY_PASSWORD' ) ) {
				$auth .= ':' . WP_PROXY_PASSWORD;
			}
			curl_setopt( $ch, CURLOPT_PROXYUSERPWD, $auth );
		}

		// Make CURL return a valid result even if it gets a 404 or other error.
		curl_setopt( $ch, CURLOPT_FAILONERROR, false );

		$nobody = ! $use_get; // Whether to send a HEAD request (the default) or a GET request.

		// parse_url() yields false/null for a malformed URL or a missing scheme, so test the
		// type before comparing. The comparison is case-insensitive to match can_check().
		$scheme = parse_url( $url, PHP_URL_SCHEME );
		if ( is_string( $scheme ) && 'https' === strtolower( $scheme ) ) {
			curl_setopt( $ch, CURLOPT_SSL_VERIFYPEER, false ); // Required to make HTTPS URLs work.
			curl_setopt( $ch, CURLOPT_SSL_VERIFYHOST, false );
		}

		if ( $nobody ) {
			// If possible, use HEAD requests for speed.
			curl_setopt( $ch, CURLOPT_NOBODY, true );
		} else {
			// If we must use GET at least limit the amount of downloaded data.
			$request_headers[] = 'Range: bytes=0-2048'; // 2 KB
		}

		// Set request headers.
		if ( ! empty( $request_headers ) ) {
			curl_setopt( $ch, CURLOPT_HTTPHEADER, $request_headers );
		}

		// Register a callback function which will process the HTTP header(s).
		// It can be called multiple times if the remote server performs a redirect.
		curl_setopt( $ch, CURLOPT_HEADERFUNCTION, array( $this, 'read_header' ) );

		// Record request headers.
		if ( defined( 'CURLINFO_HEADER_OUT' ) ) {
			curl_setopt( $ch, CURLINFO_HEADER_OUT, true );
		}

		// Apply filter for additional options. curl_setopt_array() only accepts an array,
		// so a misbehaving filter callback must not be passed through.
		$extra_options = apply_filters( 'broken-link-checker-curl-options', array() );
		if ( is_array( $extra_options ) && ! empty( $extra_options ) ) {
			curl_setopt_array( $ch, $extra_options );
		}

		// Execute the request.
		$start_time                = microtime_float();
		$content                   = curl_exec( $ch );
		$measured_request_duration = microtime_float() - $start_time;
		$blclog->debug( sprintf( 'HTTP request took %.3f seconds', $measured_request_duration ) );

		$info = curl_getinfo( $ch );

		// Store the results.
		$result['http_code']        = intval( $info['http_code'] );
		$result['final_url']        = $info['url'];
		$result['request_duration'] = $info['total_time'];
		$result['redirect_count']   = $info['redirect_count'];

		// CURL doesn't return a request duration when a timeout happens, so we measure it ourselves.
		// It is useful to see how long the plugin waited for the server to respond before assuming it timed out.
		if ( empty( $result['request_duration'] ) ) {
			$result['request_duration'] = $measured_request_duration;
		}

		// Determine if the link counts as "broken".
		if ( 0 === absint( $result['http_code'] ) ) {
			$result['broken'] = true;

			$error_code = curl_errno( $ch );
			$log       .= sprintf( "%s [Error #%d]\n", curl_error( $ch ), $error_code );

			// We only handle a couple of CURL error codes; most are highly esoteric.
			// libcurl "CURLE_" constants can't be used here because some of them have
			// different names or values in PHP.
			switch ( $error_code ) {
				case 6: // CURLE_COULDNT_RESOLVE_HOST
					$result['status_code'] = BLC_LINK_STATUS_WARNING;
					$result['status_text'] = __( 'Server Not Found', 'broken-link-checker' );
					$result['error_code']  = 'couldnt_resolve_host';
					break;

				case 28: // CURLE_OPERATION_TIMEDOUT
					$result['timeout'] = true;
					break;

				case 7: // CURLE_COULDNT_CONNECT
					// More often than not, this error code indicates that the connection attempt
					// timed out. This heuristic tries to distinguish between connections that fail
					// due to timeouts and those that fail due to other causes.
					if ( $result['request_duration'] >= 0.9 * $conf->options['timeout'] ) {
						$result['timeout'] = true;
					} else {
						$result['status_code'] = BLC_LINK_STATUS_WARNING;
						$result['status_text'] = __( 'Connection Failed', 'broken-link-checker' );
						$result['error_code']  = 'connection_failed';
					}
					break;

				default:
					$result['status_code'] = BLC_LINK_STATUS_WARNING;
					$result['status_text'] = __( 'Unknown Error', 'broken-link-checker' );
			}
		} elseif ( 999 === $result['http_code'] ) {
			$result['status_code'] = BLC_LINK_STATUS_WARNING;
			$result['status_text'] = __( 'Unknown Error', 'broken-link-checker' );
			$result['warning']     = true;
		} else {
			$result['broken'] = $this->is_error_code( $result['http_code'] );
		}

		// Give hooked callbacks access to the handle before curl closes. The return value is not used.
		apply_filters( 'broken-link-checker-curl-before-close', $ch, $content, $this->last_headers );

		curl_close( $ch );

		$blclog->info(
			sprintf(
				'HTTP response: %d, duration: %.2f seconds, status text: "%s"',
				$result['http_code'],
				$result['request_duration'],
				isset( $result['status_text'] ) ? $result['status_text'] : 'N/A'
			)
		);

		// A truthy filter result suppresses the GET retry below.
		$suppress_get_retry = apply_filters( 'blc_use_get_checker', false, $result );

		$looks_retryable = $result['broken'] || ( 1 === (int) $result['redirect_count'] );
		if ( $nobody && ! $result['timeout'] && ! $suppress_get_retry && $looks_retryable ) {
			// The site in question might be expecting GET instead of HEAD, so lets retry the request
			// using the GET verb...but not in cases of timeout, or where we've already done it.
			// Note : normally a server that doesn't allow HEAD requests on a specific resource *should*
			// return "405 Method Not Allowed". Unfortunately, there are sites that return 404 or
			// another, even more general, error code instead. So just checking for 405 wouldn't be enough.
			return $this->check( $url, true );
		}

		// When safe_mode or open_basedir is enabled CURL will be forbidden from following redirects,
		// so redirect_count will be 0 for all URLs. As a workaround, set it to 1 when the HTTP
		// response codes indicates a redirect but redirect_count is zero.
		// Note to self : Extracting the Location header might also be helpful.
		$redirect_codes = array( 301, 302, 303, 307, 308 );
		if ( ( 0 === absint( $result['redirect_count'] ) ) && in_array( $result['http_code'], $redirect_codes, true ) ) {
			$result['redirect_count'] = 1;
		}

		// Build the log from HTTP code and headers.
		$log .= '=== ';
		if ( $result['http_code'] ) {
			$log .= sprintf( __( 'HTTP code : %d', 'broken-link-checker' ), $result['http_code'] );
		} else {
			$log .= __( '(No response)', 'broken-link-checker' );
		}
		$log .= " ===\n\n";

		$log .= "Response headers\n" . str_repeat( '=', 16 ) . "\n";
		$log .= htmlentities( $this->last_headers );

		if ( isset( $info['request_header'] ) ) {
			$log .= "Request headers\n" . str_repeat( '=', 16 ) . "\n";
			$log .= htmlentities( $info['request_header'] );
		}

		// Only GET requests have a body worth logging, and only broken links need it.
		if ( ! $nobody && ( false !== $content ) && $result['broken'] ) {
			$log .= "Response HTML\n" . str_repeat( '=', 16 ) . "\n";
			$log .= htmlentities( substr( $content, 0, 2048 ) );
		}

		if ( ! empty( $result['broken'] ) && ! empty( $result['timeout'] ) ) {
			$log .= "\n(" . __( "Most likely the connection timed out or the domain doesn't exist.", 'broken-link-checker' ) . ')';
		}

		$result['log'] = $log;

		// The hash should contain info about all pieces of data that pertain to determining if the
		// link is working.
		$result['result_hash'] = implode(
			'|',
			array(
				$result['http_code'],
				! empty( $result['broken'] ) ? 'broken' : '0',
				! empty( $result['timeout'] ) ? 'timeout' : '0',
				blcLink::remove_query_string( $result['final_url'] ),
			)
		);

		return $result;
	}

	/**
	 * cURL header callback: append each received header line to $last_headers.
	 *
	 * @param mixed  $ch     The cURL handle (unused).
	 * @param string $header One raw header line.
	 * @return int Number of bytes handled, which is the length of $header.
	 */
	function read_header( /** @noinspection PhpUnusedParameterInspection */ $ch, $header ) {
		$this->last_headers .= $header;
		return strlen( $header );
	}
}
/**
 * HTTP(S) link checker that sends its requests through the WordPress HTTP API.
 */
class blcWPHttp extends blcHttpCheckerBase {

	/**
	 * Check a single URL with wp_safe_remote_get().
	 *
	 * Any WP_Error returned by the request is reported as a timeout.
	 *
	 * @param string $url
	 * @param bool   $use_get Accepted for signature parity with the other checkers; this
	 *                        transport always sends a GET request, so the value is ignored.
	 * @return array Result with the keys 'broken', 'timeout', 'http_code', 'message',
	 *               'log', 'final_url' and 'result_hash'.
	 */
	function check( $url, $use_get = false ) {
		$result = array(
			'broken'  => false,
			'timeout' => false,
		);
		$log    = '';

		// Get the timeout setting from the BLC configuration.
		$conf    = blc_get_configuration();
		$timeout = $conf->options['timeout'];

		$request_args = array(
			'timeout'    => $timeout,
			'user-agent' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)', // masquerade as IE 7
			// NOTE(review): 'aa' does not look like a request argument the WP HTTP API recognises;
			// possibly meant to be 'limit_response_size' - confirm before changing.
			'aa'         => 1024 * 5,
		);

		$request = wp_safe_remote_get( $this->urlencodefix( $url ), $request_args );
		$failed  = is_wp_error( $request );

		if ( $failed ) {
			// A failed request (including a timeout) comes back as a WP_Error.
			$result['http_code'] = 0;
			$result['timeout']   = true;
			// get_error_message() is an instance method; calling it statically is a fatal error on PHP 8.
			$result['message'] = $request->get_error_message();
		} else {
			// The status lives under response.code; the helpers read it without touching a missing key.
			$result['http_code'] = (int) wp_remote_retrieve_response_code( $request );
			$result['message']   = wp_remote_retrieve_response_message( $request );
		}

		// Build the log.
		$log .= '=== ';
		if ( $result['http_code'] ) {
			$log .= sprintf( __( 'HTTP code : %d', 'broken-link-checker' ), $result['http_code'] );
		} else {
			$log .= __( '(No response)', 'broken-link-checker' );
		}
		$log .= " ===\n\n";

		if ( $result['message'] ) {
			$log .= $result['message'] . "\n";
		}

		if ( $failed ) {
			$log .= __( 'Request timed out.', 'broken-link-checker' ) . "\n";
		}

		// Determine if the link counts as "broken".
		$result['broken'] = $this->is_error_code( $result['http_code'] ) || $result['timeout'];

		$log .= '<em>(' . __( 'Using WP HTTP', 'broken-link-checker' ) . ')</em>';

		$result['log']       = $log;
		$result['final_url'] = $url;

		// The hash should contain info about all pieces of data that pertain to determining if the
		// link is working.
		$result['result_hash'] = implode(
			'|',
			array(
				$result['http_code'],
				$result['broken'] ? 'broken' : '0',
				$result['timeout'] ? 'timeout' : '0',
				blcLink::remove_query_string( $result['final_url'] ),
			)
		);

		return $result;
	}
}