diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php index cacb48b46..3e9c48617 100644 --- a/components/DataLiberation/CSS/class-cssprocessor.php +++ b/components/DataLiberation/CSS/class-cssprocessor.php @@ -1571,7 +1571,7 @@ private function decode_string_or_url( int $start, int $length ): string { if ( $normal_len > 0 ) { // Clamp to not exceed the end boundary. $normal_len = min( $normal_len, $end - $at ); - $decoded .= substr( $this->css, $at, $normal_len ); + $decoded .= wp_scrub_utf8( substr( $this->css, $at, $normal_len ) ); $at += $normal_len; } @@ -1688,21 +1688,14 @@ private function decode_escape_at( int $offset, &$bytes_consumed ): string { $new_at = $at; $invalid_length = 0; if ( 1 !== _wp_scan_utf8( $this->css, $new_at, $invalid_length, null, 1 ) ) { - /** - * Trouble ahead! - * Bytes at $at are not a valid UTF-8 sequence. - * - * We'll move forward by $invalid_length bytes and continue processing. - * Later on, during the string decoding, we'll replace the invalid bytes with U+FFFD - * via maximal subpart”replacement. - */ - $matched_bytes = $invalid_length; - } else { - $matched_bytes = $new_at - $at; + // Bytes at $at are not a valid UTF-8 sequence. Consume the maximal + // invalid subpart and return U+FFFD per the CSS spec. 
+ $bytes_consumed = $invalid_length; + return "\u{FFFD}"; } - $bytes_consumed = $matched_bytes; - return substr( $this->css, $at, $matched_bytes ); + $bytes_consumed = $new_at - $at; + return substr( $this->css, $at, $bytes_consumed ); } /** diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php index ed164ac24..9e7134f45 100644 --- a/components/DataLiberation/Tests/CSSProcessorTest.php +++ b/components/DataLiberation/Tests/CSSProcessorTest.php @@ -146,6 +146,68 @@ public function test_invalid_utf8_with_two_single_byte_invalid_sequences(): void $this->assertSame( $expected, $actual_tokens ); } + /** + * In the slow path of decode_string_or_url() (triggered by a backslash escape), normal + * text segments must still have invalid UTF-8 bytes replaced with U+FFFD, just + * as the fast path does via wp_scrub_utf8(). + */ + public function test_invalid_utf8_in_normal_segment_combined_with_escape(): void { + // The ident token contains an invalid UTF-8 byte (0xF1) in the "normal" + // segment before a CSS hex escape (\41 = U+0041 = 'A'). The backslash + // triggers the slow path, which previously skipped wp_scrub_utf8() on the + // normal segment. + $css = ".test\xF1\\41name"; + + $expected = array( + array( + 'type' => CSSProcessor::TOKEN_DELIM, + 'raw' => '.', + 'value' => '.', + ), + array( + 'type' => CSSProcessor::TOKEN_IDENT, + // raw contains the original bytes. + 'raw' => "test\xF1\\41name", + // value must have 0xF1 replaced with U+FFFD and \41 decoded to 'A'. + 'value' => "test\u{FFFD}Aname", + ), + ); + + $processor = CSSProcessor::create( $css ); + $actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) ); + $this->assertSame( $expected, $actual_tokens ); + } + + /** + * When an invalid UTF-8 byte is the character directly after a backslash + * (i.e. it is the escaped character itself), decode_escape_at() must replace + * the invalid byte with U+FFFD. 
+ */ + public function test_invalid_utf8_as_escaped_character(): void { + // The input is `.a`, a backslash, the lone invalid byte 0xF1, then `b`. + // The backslash makes 0xF1 the escaped character of an escape sequence, + // so the ident's decoded value must hold U+FFFD in place of that byte. + $css = ".a\\\xF1b"; + + $expected = array( + array( + 'type' => CSSProcessor::TOKEN_DELIM, + 'raw' => '.', + 'value' => '.', + ), + array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => "a\\\xF1b", + // The escaped 0xF1 must be replaced with U+FFFD. + 'value' => "a\u{FFFD}b", + ), + ); + + $processor = CSSProcessor::create( $css ); + $actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) ); + $this->assertSame( $expected, $actual_tokens ); + } + /** * Legacy test to ensure basic tokenization still works. */