class-replace-link.php
5.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
<?php
/**
* Executes the `Replace` action on Broken Links.
*
* @link https://wordpress.org/plugins/broken-link-checker/
* @since 2.1
*
* @author WPMUDEV (https://wpmudev.com)
* @package WPMUDEV_BLC\App\Broken_Links_Actions
*
* @copyright (c) 2022, Incsub (http://incsub.com)
*/
namespace WPMUDEV_BLC\App\Broken_Links_Actions\Processors;
// Abort if called directly.
defined( 'WPINC' ) || die;
use WPMUDEV_BLC\Core\Utils\Abstracts\Base;
/**
* Class Scan_Data
*
* @package WPMUDEV_BLC\App\Broken_Links_Actions\Processors
*/
class Replace_Link extends Base {
public function execute( string $content = '', string $link = '', string $new_link = '' ) {
if ( empty( $this->get_target_tags() ) ) {
return $content;
}
$link = trim( $link, '\'"' );
foreach ( $this->get_target_tags() as $tag_name => $tag_atts ) {
if ( ! empty( $tag_atts ) ) {
foreach ( $tag_atts as $tag_att ) {
// Dom is preferred method, but it will "fix" any broken/corrupt markup which in some cases the broken markup might be intentional. Therefor by default we use regex.
if ( apply_filters( 'wpmudev_blc_link_action_use_dom', false, 'replace', $link ) ) {
$dom = new \DOMDocument();
$html = '';
libxml_use_internal_errors( true );
$dom->loadHTML( $content );
foreach ( $dom->getElementsByTagName( 'a' ) as $dom_link ) {
$old_link = $dom_link->getAttribute( "href" );
if ( strcasecmp( trailingslashit( $old_link ), trailingslashit( $link ) ) == 0 ) {
$dom_link->setAttribute( 'href', $new_link );
}
}
$dom->saveHtml();
foreach ( $dom->getElementsByTagName( 'body' )->item( 0 )->childNodes as $element ) {
$html .= $dom->saveXML( $element, LIBXML_NOEMPTYTAG );
}
if ( ! empty( $html ) ) {
$content = $html;
}
} else {
// Regex is default way, even if not preferred. The reason we use regex by default is because, DOM will fix any invalid markup which might not be desired.
// /<a\s[^>]*href=["'](.*?)["']>(.*)/gm : 231 steps, 0,1 ms
// /<a\s[^>]*href=["'](.*?)["']>(.*)<\/a>/gm : 255 steps, 0,1 ms
// $regexp = "<{$tag_name}\s[^>]*{$tag_att}=(\"??)([^\" >]*?)\\1[^>]*>(.*)<\/{$tag_name}>";
$regexp = "<{$tag_name}\s[^>]*{$tag_att}=(\"??)([^\" >]*?)[^>]*>(.*)<\/{$tag_name}>";
$content = preg_replace_callback(
"/$regexp/siU",
function ( $match ) use ( $link, $new_link, $tag_att ) {
$old_link = untrailingslashit( trim( $match[2], '\'"' ) );
$links_match = false;
/**
* If link doesn't have Host part, it is internal link. All internal links are stored as relative urls in Queue.
* If link does have the Host part, then it's an external link.
* For internal links we need to test 3 cases:
* 1. Relative urls (that is how it is stored in Queue)
* 2. Absolute url.
* 3. Absolute url without scheme ( //site.com )
* Engine treats all those types as full urls
*/
if ( empty( wp_parse_url( $link, PHP_URL_HOST ) ) ) {
// Check Relative, Absolute and Absolute without host urls for $old_link.
$site_url = site_url();
$scheme = wp_parse_url( $site_url, PHP_URL_SCHEME ) . ':';
$absolute_link = $site_url . $link;
$semi_absolute_link = str_replace( $scheme, '', $absolute_link );
$links_match = strcasecmp( trailingslashit( $link ), trailingslashit( $old_link ) ) == 0 ||
strcasecmp( trailingslashit( $absolute_link ), trailingslashit( $old_link ) ) == 0 ||
strcasecmp( trailingslashit( $semi_absolute_link ), trailingslashit( $old_link ) ) == 0;
} else {
// Check $old_link normally.
$links_match = strcasecmp( trailingslashit( $old_link ), trailingslashit( $link ) ) == 0;
}
//if ( $old_link === $link ) {
if ( $links_match ) {
// Let's not use str_replace, so we avoid replacing url in tag content or other tag atts.
// We have stripped trailing slashes, but we use `/?` in regex as link might have or might not have trailing slash.
return preg_replace( "#({$tag_att}=[\"|\'])" . $old_link . '/?(["|\'])#i', '\1' . $new_link . '\2', $match[0], 1 );
} else {
return $match[0];
}
},
$content
);
}
}
}
}
return $content;
/*
if ( ! class_exists( '\blcUtility' ) ) {
include_once BLC_DIRECTORY_LEGACY . '/includes/utility-class.php';
}
foreach ( $this->get_target_tags() as $tag_name => $tag_att ) {
// Use the same method that Local BLC uses.
// We're setting `selfclosing` (3rd param) to `null` so that can be determined by method.
$content_links = \blcUtility::extract_tags( $content, $tag_name, null, true );
if ( ! empty( $content_links ) ) {
$offset = 0;
foreach ( $content_links as $content_link ) {
$content = substr_replace( $content, $new_link, $content_link['offset'] + $offset, strlen( $content_link['full_tag'] ) );
//Update the replacement offset
$offset += ( strlen( $new_link ) - strlen( $content_link['full_tag'] ) );
}
}
}
*/
}
public function get_target_tags() {
return apply_filters(
'wpmudev_blc_replace_target_tags',
array(
'a' => array( 'href' ),
//'img' => array( 'src', 'srcset ),
//'iframe' => 'src'
)
);
}
}