Skip to content

Commit 7d0690d

Browse files
authored
Add encodeURI(Component) and decodeURI(Component) (#1733)
1 parent 86dc8df commit 7d0690d

File tree

10 files changed

+10960
-0
lines changed

10 files changed

+10960
-0
lines changed

std/assembly/error.ts

+7
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,10 @@ export class SyntaxError extends Error {
3535
this.name = "SyntaxError";
3636
}
3737
}
38+
39+
/**
 * Error subclass whose `name` is "URIError". The URI encode/decode helpers
 * throw it with the "URI malformed" message when their input is invalid.
 */
export class URIError extends Error {
40+
constructor(message: string = "") {
41+
super(message);
42+
this.name = "URIError";
43+
}
44+
}

std/assembly/index.d.ts

+11
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,14 @@ declare function fmod(x: f64, y: f64): f64;
222222
declare function fmodf(x: f32, y: f32): f32;
223223
/** Returns the number of parameters in the given function signature type. */
224224
declare function lengthof<T extends (...args: any[]) => any>(func?: T): i32;
225+
/** Encodes a text string as a valid Uniform Resource Identifier (URI). */
226+
declare function encodeURI(str: string): string;
227+
/** Encodes a text string as a valid component of a Uniform Resource Identifier (URI). */
228+
declare function encodeURIComponent(str: string): string;
229+
/** Decodes a Uniform Resource Identifier (URI) previously created by encodeURI. */
230+
declare function decodeURI(str: string): string;
231+
/** Decodes a Uniform Resource Identifier (URI) component previously created by encodeURIComponent. */
232+
declare function decodeURIComponent(str: string): string;
225233

226234
/** Atomic operations. */
227235
declare namespace atomic {
@@ -1771,6 +1779,9 @@ declare class TypeError extends Error { }
17711779
/** Class for indicating an error when trying to interpret syntactically invalid code. */
17721780
declare class SyntaxError extends Error { }
17731781

1782+
/** Class for indicating an error when a global URI handling function was used in a wrong way. */
1783+
declare class URIError extends Error { }
1784+
17741785
interface Boolean {
17751786
toString(radix?: number): string;
17761787
}

std/assembly/uri.ts

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import { encode, decode, URI_UNSAFE, URL_UNSAFE } from "./util/uri";
2+
3+
/**
 * Percent-encodes `str` using the URI_UNSAFE escape table.
 * Delegates to the low-level `encode` helper, which works on raw pointers.
 */
export function encodeURI(str: string): string {
  const srcPtr = changetype<usize>(str);
  const outPtr = encode(srcPtr, str.length, URI_UNSAFE);
  return changetype<string>(outPtr);
}
6+
7+
/**
 * Decodes percent-escapes in `str` in non-component mode
 * (`component = false` is passed to the low-level `decode` helper).
 */
export function decodeURI(str: string): string {
  const srcPtr = changetype<usize>(str);
  const outPtr = decode(srcPtr, str.length, false);
  return changetype<string>(outPtr);
}
10+
11+
/**
 * Percent-encodes `str` using the URL_UNSAFE escape table.
 * Delegates to the low-level `encode` helper, which works on raw pointers.
 */
export function encodeURIComponent(str: string): string {
  const srcPtr = changetype<usize>(str);
  const outPtr = encode(srcPtr, str.length, URL_UNSAFE);
  return changetype<string>(outPtr);
}
14+
15+
/**
 * Decodes percent-escapes in `str` in component mode
 * (`component = true` is passed to the low-level `decode` helper).
 */
export function decodeURIComponent(str: string): string {
  const srcPtr = changetype<usize>(str);
  const outPtr = decode(srcPtr, str.length, true);
  return changetype<string>(outPtr);
}

std/assembly/util/error.ts

+4
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,7 @@ export const E_ALREADY_PINNED: string = "Object already pinned";
4444
// @ts-ignore: decorator
4545
@lazy @inline
4646
export const E_NOT_PINNED: string = "Object is not pinned";
47+
48+
// @ts-ignore: decorator
49+
@lazy @inline
50+
export const E_URI_MALFORMED: string = "URI malformed";

std/assembly/util/string.ts

+2
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,7 @@ import { ipow32 } from "../math";
458458
// @ts-ignore: decorator
459459
@inline
460460
export const enum CharCode {
461+
PERCENT = 0x25,
461462
PLUS = 0x2B,
462463
MINUS = 0x2D,
463464
DOT = 0x2E,
@@ -484,6 +485,7 @@ export const enum CharCode {
484485
e = 0x65,
485486
n = 0x6E,
486487
o = 0x6F,
488+
u = 0x75,
487489
x = 0x78,
488490
z = 0x7A
489491
}

std/assembly/util/uri.ts

+276
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
import { E_URI_MALFORMED } from "./error";
2+
import { CharCode } from "./string";
3+
4+
// Truncated lookup boolean table that helps us quickly determine
5+
// if a char needs to be escaped for URIs (RFC 2396).
// Entry `i` describes code unit `i + 33`, so the 94 entries cover 33..126;
// `encode` indexes the table with `c - 33` and escapes every code unit
// outside that range. A nonzero entry means "must be percent-escaped".
6+
// @ts-ignore: decorator
7+
@lazy export const URI_UNSAFE = memory.data<u8>([
8+
/* skip 32 + 1 always set to '1' head slots
9+
*/ 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
10+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
11+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
12+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
13+
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
14+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, /*
15+
skip 128 + 1 always set to '1' tail slots */
16+
]);
17+
18+
// Truncated lookup boolean table that helps us quickly determine
19+
// if a char needs to be escaped for URLs (RFC 3986).
// Same layout as the URI table: entry `i` describes code unit `i + 33`
// (94 entries, 33..126) and a nonzero entry means "must be percent-escaped".
// This is the table `encodeURIComponent` hands to `encode`.
20+
// @ts-ignore: decorator
21+
@lazy export const URL_UNSAFE = memory.data<u8>([
22+
/* skip 32 + 1 always set to '1' head slots
23+
*/ 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
24+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
25+
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
26+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
27+
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, /*
29+
skip 128 + 1 always set to '1' tail slots */
30+
]);
31+
32+
// Truncated lookup boolean table for determining reserved chars: ;/?:@&=+$,#
// Entry `i` describes code unit `i + 35`, so the 30 entries cover 35..64;
// `isReserved` indexes it with `ch - 35` and reports `false` outside that range.
33+
// @ts-ignore: decorator
34+
@lazy export const URI_RESERVED = memory.data<u8>([
35+
/* skip 32 + 3 always set to '0' head slots
36+
*/ 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
37+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
38+
1, /* skip 191 always set to '0' tail slots */
39+
]);
40+
41+
/**
 * Percent-encodes a UTF-16 string buffer into a newly allocated string buffer.
 *
 * Code units in 33..126 whose `table` entry is zero are copied verbatim.
 * Everything else is converted to UTF-8 and written as `%XX` escapes.
 *
 * @param src   Pointer to the first UTF-16 code unit of the input.
 * @param len   Number of UTF-16 code units in the input.
 * @param table Pointer to a 94-entry byte table indexed by `c - 33`;
 *              a nonzero entry marks a character that must be escaped.
 * @returns Pointer to the output buffer allocated via `__new` with the
 *          `String` class id, or `src` itself when `len` is 0.
 * @throws URIError if the input contains an unpaired surrogate.
 */
export function encode(src: usize, len: usize, table: usize): usize {
  if (!len) return src;

  var i: usize = 0, offset: usize = 0, outSize = len << 1;
  var dst = __new(outSize, idof<String>());

  while (i < len) {
    let org = i;
    let c: u32, c1: u32;
    // Fast scan: advance while the code unit is printable ASCII and
    // safe to copy without escaping.
    do {
      c = <u32>load<u16>(src + (i << 1));
      // Is it valid ASCII and safe? (unsigned compare covers 33..126)
      if (c - 33 < 94) { // 127 - 33
        if (load<u8>(table + (c - 33))) break;
      } else break;
    } while (++i < len);

    // If we scanned a run of safe characters, copy it without encoding.
    if (i > org) {
      let size = i - org << 1;
      if (offset + size > outSize) {
        outSize = offset + size;
        dst = __renew(dst, outSize);
      }
      // TODO: should we optimize for short cases like 2 byte size?
      memory.copy(
        dst + offset,
        src + (org << 1),
        size
      );
      offset += size;
      // Stop if we reached the end of the input string.
      if (i >= len) break;
    }

    // Decode UTF-16, rejecting unpaired surrogates.
    if (c >= 0xD800) {
      if (c >= 0xDC00 && c <= 0xDFFF) {
        // A low surrogate without a preceding high surrogate.
        throw new URIError(E_URI_MALFORMED);
      }
      if (c <= 0xDBFF) {
        // A high surrogate must be followed by a low surrogate. Advance
        // first and then bounds-check, so that a high surrogate at the very
        // end of the input is rejected without reading past the buffer.
        if (++i >= len) {
          throw new URIError(E_URI_MALFORMED);
        }
        c1 = <u32>load<u16>(src + (i << 1));
        if (c1 < 0xDC00 || c1 > 0xDFFF) {
          throw new URIError(E_URI_MALFORMED);
        }
        c = (((c & 0x3FF) << 10) | (c1 & 0x3FF)) + 0x10000;
      }
    }

    // Each "%XX" escape occupies 3 code units (6 bytes); a code point
    // needs 1 escape when ASCII and at most 4 otherwise.
    let estSize = offset + (c < 0x80 ? 1 * 6 : 4 * 6);
    if (estSize > outSize) {
      // Double the estimated size, but only for inputs longer than one
      // code unit, since a single unit was already sized for the worst case.
      outSize = len > 1 ? estSize << 1 : estSize;
      dst = __renew(dst, outSize);
    }

    if (c < 0x80) {
      // Encode an unsafe ASCII code point.
      storeHex(dst, offset, c);
      offset += 6;
    } else {
      // Encode a non-ASCII code point as UTF-8, one escape per byte.
      if (c < 0x800) {
        storeHex(dst, offset, (c >> 6) | 0xC0);
        offset += 6;
      } else {
        if (c < 0x10000) {
          storeHex(dst, offset, (c >> 12) | 0xE0);
          offset += 6;
        } else {
          storeHex(dst, offset, (c >> 18) | 0xF0);
          offset += 6;
          storeHex(dst, offset, (c >> 12 & 0x3F) | 0x80);
          offset += 6;
        }
        storeHex(dst, offset, (c >> 6 & 0x3F) | 0x80);
        offset += 6;
      }
      storeHex(dst, offset, (c & 0x3F) | 0x80);
      offset += 6;
    }
    ++i;
  }
  // Shrink the output buffer if it was over-allocated.
  if (outSize > offset) {
    dst = __renew(dst, offset);
  }
  return dst;
}
136+
137+
/**
 * Decodes `%XX` escapes in a UTF-16 string buffer into a newly allocated
 * string buffer.
 *
 * Escapes below 0x80 decode to a single code unit. Escapes at or above 0x80
 * start a UTF-8 sequence whose continuation bytes must also be escaped.
 *
 * @param src       Pointer to the first UTF-16 code unit of the input.
 * @param len       Number of UTF-16 code units in the input.
 * @param component When `false`, escapes that decode to a reserved character
 *                  (see `isReserved`) are left in their escaped form.
 * @returns Pointer to the output buffer allocated via `__new` with the
 *          `String` class id, or `src` itself when `len` is 0.
 * @throws URIError on a truncated or non-hex escape, an invalid UTF-8 lead
 *         or continuation byte, an overlong encoding, a surrogate code point
 *         or a code point above 0x10FFFF.
 */
export function decode(src: usize, len: usize, component: bool): usize {
  if (!len) return src;

  var i: usize = 0, offset: usize = 0, ch: u32 = 0;
  // The output never exceeds the input: every escape shrinks when decoded.
  var dst = __new(len << 1, idof<String>());

  while (i < len) {
    let org = i;
    // Scan forward to the next '%' (or the end of the input).
    while (i < len && (ch = load<u16>(src + (i << 1))) != CharCode.PERCENT) i++;

    // Copy the run of literal characters preceding the escape.
    if (i > org) {
      let size = i - org << 1;
      // TODO: should we optimize for short cases like 2 byte size?
      memory.copy(
        dst + offset,
        src + (org << 1),
        size
      );
      offset += size;
      if (i >= len) break;
    }

    // Decode the "%XX" triplet; it needs two code units after the '%'.
    if (
      i + 2 >= len ||
      ch != CharCode.PERCENT ||
      (ch = loadHex(src, i + 1 << 1)) == -1
    ) throw new URIError(E_URI_MALFORMED);

    i += 3;
    if (ch < 0x80) {
      if (!component && isReserved(ch)) {
        // Keep reserved characters escaped: emit the '%' itself and rewind
        // so that the two hex digits are copied as plain text next round.
        ch = CharCode.PERCENT;
        i -= 2;
      }
    } else {
      // Decode a UTF-8 sequence.
      let nb = utf8LenFromUpperByte(ch);
      // Reject an invalid lead byte right away. Otherwise `--nb` below would
      // wrap around and keep consuming escapes before failing anyway.
      if (!nb) throw new URIError(E_URI_MALFORMED);
      // Smallest code point that legitimately needs `nb` bytes:
      // 2 => 0x80, 3 => 0x800, 4 => 0x10000
      let lo: u32 = 1 << (17 * nb >> 2) - 1;
      // Payload mask of the lead byte: 2 => 31, 3 => 15, 4 => 7
      ch &= (0x80 >> nb) - 1;

      while (--nb != 0) {
        let c1: u32;
        // Every continuation byte must itself be a "%XX" triplet.
        if (
          i + 2 >= len ||
          load<u16>(src + (i << 1)) != CharCode.PERCENT ||
          (c1 = loadHex(src, i + 1 << 1)) == -1
        ) throw new URIError(E_URI_MALFORMED);

        i += 3;
        // Continuation bytes have the form 10xxxxxx.
        if ((c1 & 0xC0) != 0x80) {
          throw new URIError(E_URI_MALFORMED);
        }
        ch = (ch << 6) | (c1 & 0x3F);
      }

      // Reject overlong encodings, out-of-range values and surrogate code
      // points, none of which map to valid UTF-16.
      if (ch < lo || ch > 0x10FFFF || (ch >= 0xD800 && ch < 0xE000)) {
        throw new URIError(E_URI_MALFORMED);
      }

      // Encode as UTF-16; supplementary code points need a surrogate pair.
      if (ch >= 0x10000) {
        ch -= 0x10000;
        let highSurrogate = ch >> 10 | 0xD800;
        let lowSurrogate = (ch & 0x03FF) | 0xDC00;
        // Write both code units with a single 32-bit store, high surrogate
        // in the lower half so that it lands first in memory.
        store<u32>(dst + offset, highSurrogate | (lowSurrogate << 16));
        offset += 4;
        continue;
      }
    }
    store<u16>(dst + offset, ch);
    offset += 2;
  }

  assert(offset <= (len << 1));
  // Shrink the output buffer if it was over-allocated.
  if ((len << 1) > offset) {
    dst = __renew(dst, offset);
  }
  return dst;
}
223+
224+
/**
 * Writes the three UTF-16 code units "%XX" at `dst + offset`, where XX is
 * the upper-case hexadecimal form of the low byte of `ch`.
 */
function storeHex(dst: usize, offset: usize, ch: u32): void {
  // ASCII codes of '0'..'9' followed by 'A'..'F'.
  // @ts-ignore: decorator
  const HEX_CHARS = memory.data<u8>([
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46
  ]);

  const out = dst + offset;
  const highDigit = <u32>load<u8>(HEX_CHARS + ((ch >> 4) & 0x0F));
  const lowDigit = <u32>load<u8>(HEX_CHARS + (ch & 0x0F));

  // '%' goes first, then both hex digits packed into one 32-bit store.
  store<u16>(out, CharCode.PERCENT, 0);
  store<u32>(out, highDigit | (lowDigit << 16), 2);
}
239+
240+
/**
 * Reads two UTF-16 code units at `src + offset` and interprets them as a
 * two-digit hexadecimal byte. Yields -1 when either unit is not a hex digit.
 */
function loadHex(src: usize, offset: usize): u32 {
  const base = src + offset;
  const high = <u32>load<u16>(base, 0);
  const low = <u32>load<u16>(base, 2);
  if (!isHex(high) || !isHex(low)) return -1;
  return fromHex(high) << 4 | fromHex(low);
}
247+
248+
/**
 * Converts the code unit of a hex digit to its numeric value (0..15).
 * Setting bit 5 folds 'A'..'F' onto 'a'..'f' and leaves '0'..'9' unchanged;
 * modulo 39 then maps '0'..'9' to 9..18 and 'a'..'f' to 19..24.
 */
// @ts-ignore: decorator
@inline function fromHex(ch: u32): u32 {
  const folded = ch | 32;
  return folded % 39 - 9;
}
252+
253+
/**
 * Yields the total byte length of a UTF-8 sequence from its lead byte:
 *   0xC0..0xDF => 2, 0xE0..0xEF => 3, 0xF0..0xF7 => 4, anything else => 0.
 */
// @ts-ignore: decorator
@inline function utf8LenFromUpperByte(c0: u32): u32 {
  // Unsigned range check: only 0xC0..0xF7 can start a multi-byte sequence.
  if (c0 - 0xC0 >= 56) return 0;
  // The count of leading one bits in the byte equals the sequence length.
  return clz(~(c0 << 24));
}
264+
265+
/**
 * Tells whether `ch` is flagged in the URI_RESERVED table, which covers
 * code units 35..64. Anything outside that range is not reserved.
 */
// @ts-ignore: decorator
@inline function isReserved(ch: u32): bool {
  const slot = ch - 35;
  // Unsigned compare also rejects values below 35.
  if (slot >= 30) return false;
  return <bool>load<u8>(URI_RESERVED + slot);
}
271+
272+
/**
 * Tells whether `ch` is an ASCII hexadecimal digit: '0'..'9', 'a'..'f'
 * or 'A'..'F'. Relies on unsigned wrap-around for the range checks.
 */
// @ts-ignore: decorator
@inline function isHex(ch: u32): bool {
  const isDigit = ch - CharCode._0 < 10;
  // Setting bit 5 folds upper-case letters onto lower-case ones.
  const isLetter = (ch | 32) - CharCode.a < 6;
  // @ts-ignore
  return isDigit | isLetter;
}

tests/compiler/std/uri.json

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"asc_flags": [
3+
],
4+
"asc_rtrace": true
5+
}

0 commit comments

Comments
 (0)