From c5610871d021b7b7f6fbfc171d7caebe0b86891c Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Wed, 24 Jun 2026 13:06:59 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=8C=20fix:=20Prevent=20ReDoS=20in=20Yo?= =?UTF-8?q?uTube=20URL=20Extraction=20for=20URL=20Context=20(#13937)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ๐Ÿ›ก๏ธ fix: Prevent ReDoS in YouTube URL extraction for URL Context The YouTube detection/strip regexes ran as a single global pass over authenticated, user-controlled chat text. The engine could restart at every `youtube.com/watch?` occurrence and the lazy `\S*?&` rescanned the rest of a long non-whitespace token each time, giving quadratic CPU behavior that blocks the Node event loop (DoS) for Google/Vertex agents with url_context enabled. - Tokenize on whitespace and skip tokens longer than a real URL, and cap the total text scanned, so work is bounded to O(n). URLs never contain whitespace, so per-token matching is equivalent. - Replace the lazy unbounded `(?:\S*?&)?` with the delimiter-bounded `(?:[^\s&]*&)*` (no behavior change for real URLs). - Apply the same discipline to the strip path. - Add ReDoS regression tests; a 3MB crafted input now completes in <10ms. * ๐Ÿ›ก๏ธ fix: Bound the YouTube strip scan by the same total budget Address Codex P1: the strip path applied only the per-token cap, so a valid URL followed by many sub-cap malformed tokens still regex-scanned the entire message (~1s on 3MB). Injected ids only come from the first MAX_YOUTUBE_SCAN_CHARS (extraction's cap), so a link beyond that is never in injectedIds anyway; cap the strip scan at the same budget and leave the tail verbatim. 3MB PoC: ~1s -> ~14ms. * ๐Ÿงฌ fix: Make YouTube URL matching linear instead of capping the scan The previous fix bounded the scan with per-token + total-scan caps, but the total-scan cap discarded content: a URL near the end of a long prompt was missed (extraction sliced to 100k), and large prepended file/quote context exhausted the strip budget before the real URL (strip skipped it). Codex round 2 (P2 x2). Replace the backtracking-prone matcher with a linear one: a single regex captures host + path/query (greedy `[^\s]*`, bounded `{1,63}`/`{0,10}` subdomain repetition, no lazy/ambiguous quantifier), and the video id is parsed from the capture afterwards. This is O(n) over arbitrary input, so the scan caps (and the content they discarded) are removed entirely. Extraction and stripping now scan the whole message linearly. Benchmarks (no caps): 3MB attack token ~3ms, 3MB many-token ~4ms, valid URL at end of 3MB found in ~18ms. Adds regression tests for long-prompt extraction and stripping past large prepended context. * ๐Ÿ”ก fix: Match adjacent + capitalized YouTube URLs after linear rewrite Codex round 3 (regressions from the linear matcher): - Stop the path capture at URL-list delimiters (`,` `)` `]` `<` `>`, none of which occur in a real YouTube URL) so adjacent links in one token (comma-separated or markdown `](url1)](url2)`) are matched separately instead of swallowed. - Lowercase the path segment before matching route names, since the detection regex is case-insensitive (`/WATCH?v=`, `/EMBED/`). * ๐Ÿ”’ fix: Allowlist URL chars + bounded path parsing for YouTube matching Codex round 4: - Replace the path stop-char blocklist with an allowlist of characters that occur in real YouTube URLs, so adjacent links separated by any prose delimiter (`;`, `|`, etc.) are matched separately instead of swallowed. - Parse the route with anchored, bounded regexes instead of `path.split('/')`, so a malformed path of millions of slashes no longer allocates a huge array / blocks the event loop. Also bounds the `v=` param read. * ๐ŸŽฏ fix: Restrict YouTube matcher to recognized video routes Codex round 5: a nested video URL inside an unrecognized YouTube URL (`youtube.com/redirect?q=https://youtu.be/`) was swallowed by the greedy match and missed. Restrict the matcher to recognized single-video forms (youtu.be/, /(shorts|live|embed|v)/, /watch?) so an unrecognized route doesn't match and the global scan continues into the nested link. Stays linear (verified: 3MB redirect/slash/host floods all <25ms) and keeps the allowlist tail so adjacent links still split. Adds nested-URL + unrecognized-route regression tests. * ๐ŸŽฌ fix: Find nested watch links + skip malformed v= duplicates Codex round 6 (P3 watch-query edges): - Drop `:` from the path allowlist. It never occurs in a real YouTube path/query, but `://` of a nested URL does โ€” so `watch?url=https://youtu.be/` now stops the watch match and the scan finds the nested link. - Scan every `v=` param and return the first valid 11-char id, so a malformed earlier `v=` (e.g. `watch?v=tooShort&v=`) no longer shadows a later valid one. * ๐Ÿงน fix: Strip whole YouTube URL incl. colon-containing trailing params Codex round 7: dropping `:` from the tail (round 6) made the strip path stop mid-URL on a URL-valued param (`watch?v=&next=https://example.com`), leaving `://example.com` orphaned. Use a separate strip matcher whose tail re-includes `:` so the whole URL token is removed, while detection keeps the `:`-excluded tail to still find nested video links. Also corrects a stale "per-token cap" comment left over from the linear rewrite. --- .../api/src/endpoints/google/youtube.spec.ts | 161 ++++++++++++++++++ packages/api/src/endpoints/google/youtube.ts | 109 +++++++++--- 2 files changed, 242 insertions(+), 28 deletions(-) diff --git a/packages/api/src/endpoints/google/youtube.spec.ts b/packages/api/src/endpoints/google/youtube.spec.ts index 5a4c42a9c8..84fe7c230e 100644 --- a/packages/api/src/endpoints/google/youtube.spec.ts +++ b/packages/api/src/endpoints/google/youtube.spec.ts @@ -78,6 +78,54 @@ describe('extractYouTubeUrls', () => { expect(extractYouTubeUrls(text)).toEqual([WATCH('dQw4w9WgXcQ')]); }); + it('extracts both URLs from a comma-separated pair in one token', () => { + const text = 'https://youtu.be/aaaaaaaaaaa,https://youtu.be/bbbbbbbbbbb'; + expect(extractYouTubeUrls(text)).toEqual([WATCH('aaaaaaaaaaa'), WATCH('bbbbbbbbbbb')]); + }); + + it('extracts adjacent markdown-style links', () => { + const text = '[a](https://youtu.be/aaaaaaaaaaa)[b](https://youtu.be/bbbbbbbbbbb)'; + expect(extractYouTubeUrls(text)).toEqual([WATCH('aaaaaaaaaaa'), WATCH('bbbbbbbbbbb')]); + }); + + it('extracts adjacent links separated by semicolons or pipes', () => { + expect(extractYouTubeUrls('https://youtu.be/aaaaaaaaaaa;https://youtu.be/bbbbbbbbbbb')).toEqual( + [WATCH('aaaaaaaaaaa'), WATCH('bbbbbbbbbbb')], + ); + expect(extractYouTubeUrls('https://youtu.be/aaaaaaaaaaa|https://youtu.be/bbbbbbbbbbb')).toEqual( + [WATCH('aaaaaaaaaaa'), WATCH('bbbbbbbbbbb')], + ); + }); + + it('finds a nested youtu.be link inside an unrecognized YouTube URL', () => { + const text = 'https://www.youtube.com/redirect?q=https://youtu.be/dQw4w9WgXcQ'; + expect(extractYouTubeUrls(text)).toEqual([WATCH('dQw4w9WgXcQ')]); + }); + + it('finds a nested youtu.be link inside a watch URL with no own v=', () => { + const text = 'https://www.youtube.com/watch?url=https://youtu.be/dQw4w9WgXcQ'; + expect(extractYouTubeUrls(text)).toEqual([WATCH('dQw4w9WgXcQ')]); + }); + + it('skips a malformed v= and uses a later valid one', () => { + const text = 'https://www.youtube.com/watch?v=tooShort&list=x&v=dQw4w9WgXcQ'; + expect(extractYouTubeUrls(text)).toEqual([WATCH('dQw4w9WgXcQ')]); + }); + + it('ignores unrecognized YouTube routes with no nested video', () => { + expect(extractYouTubeUrls('https://www.youtube.com/results?search_query=cats')).toEqual([]); + expect(extractYouTubeUrls('https://www.youtube.com/@SomeChannel')).toEqual([]); + }); + + it('matches capitalized watch/embed paths (case-insensitive)', () => { + expect(extractYouTubeUrls('https://www.youtube.com/WATCH?v=dQw4w9WgXcQ')).toEqual([ + WATCH('dQw4w9WgXcQ'), + ]); + expect(extractYouTubeUrls('https://www.youtube.com/EMBED/dQw4w9WgXcQ')).toEqual([ + WATCH('dQw4w9WgXcQ'), + ]); + }); + it('ignores non-YouTube URLs', () => { expect(extractYouTubeUrls('https://example.com/watch?v=dQw4w9WgXcQ')).toEqual([]); }); @@ -324,6 +372,23 @@ describe('appendYouTubeVideoParts', () => { expect(mediaParts).toHaveLength(1); expect((mediaParts[0] as Record).fileUri).toBe(WATCH('aaaaaaaaaaa')); }); + + it('strips a watch URL fully when it has a URL-valued param after the id (no orphan)', () => { + const text = + 'summarize https://www.youtube.com/watch?v=dQw4w9WgXcQ&next=https://example.com please'; + const result = appendYouTubeVideoParts({ enabled: true, text, content: text, max: 5 }); + + const parts = result as MessageContentComplex[]; + const textPart = parts.find((p) => (p as Record).type === 'text') as { + text: string; + }; + expect(textPart.text).toBe('summarize please'); + expect(textPart.text).not.toContain('example.com'); + + const mediaParts = parts.filter((p) => (p as Record).type === 'media'); + expect(mediaParts).toHaveLength(1); + expect((mediaParts[0] as Record).fileUri).toBe(WATCH('dQw4w9WgXcQ')); + }); }); describe('resolveYouTubeInjectionConfig', () => { @@ -363,3 +428,99 @@ describe('resolveYouTubeInjectionConfig', () => { expect(resolveYouTubeInjectionConfig({ provider: Providers.GOOGLE })).toEqual({ max: 1 }); }); }); + +describe('ReDoS safety', () => { + it('returns quickly for a long malformed watch token with no v= parameter', () => { + /** One ~600KB non-whitespace token; before the fix this exhibited quadratic backtracking. */ + const malicious = 'https://www.youtube.com/watch?'.repeat(20000); + const start = Date.now(); + const result = extractYouTubeUrls(malicious, 5); + const elapsed = Date.now() - start; + + expect(result).toEqual([]); + expect(elapsed).toBeLessThan(1000); + }); + + it('returns quickly for a malformed path of millions of slashes', () => { + const malicious = `https://www.youtube.com/${'/'.repeat(3_000_000)}`; + const start = Date.now(); + const result = extractYouTubeUrls(malicious, 5); + + expect(result).toEqual([]); + expect(Date.now() - start).toBeLessThan(1000); + }); + + it('returns quickly for many medium malformed watch tokens', () => { + const token = 'https://www.youtube.com/watch?'.repeat(50); // ~1.5KB malformed token + const malicious = `${token} `.repeat(2000); // ~3MB of whitespace-separated malformed tokens + const start = Date.now(); + const result = extractYouTubeUrls(malicious, 5); + + expect(result).toEqual([]); + expect(Date.now() - start).toBeLessThan(1000); + }); + + it('still extracts a valid URL that appears before a huge token', () => { + const text = `https://youtu.be/dQw4w9WgXcQ ${'x'.repeat(60000)}`; + expect(extractYouTubeUrls(text, 5)).toEqual([WATCH('dQw4w9WgXcQ')]); + }); + + it('injects + strips quickly when a valid URL is followed by a huge malformed token', () => { + const text = `https://youtu.be/dQw4w9WgXcQ ${'https://www.youtube.com/watch?'.repeat(20000)}`; + const start = Date.now(); + const result = appendYouTubeVideoParts({ enabled: true, text, content: text, max: 5 }); + const elapsed = Date.now() - start; + + expect(elapsed).toBeLessThan(1000); + const mediaParts = (result as MessageContentComplex[]).filter( + (p) => (p as Record).type === 'media', + ); + expect(mediaParts).toHaveLength(1); + expect((mediaParts[0] as Record).fileUri).toBe(WATCH('dQw4w9WgXcQ')); + }); + + it('strips quickly when a valid URL precedes ~3MB of medium malformed tokens', () => { + /** Exercises the strip path: extraction finds the valid URL, then strip must stay bounded. */ + const tail = `${'https://www.youtube.com/watch?'.repeat(50)} `.repeat(2000); + const text = `https://youtu.be/dQw4w9WgXcQ ${tail}`; + const start = Date.now(); + const result = appendYouTubeVideoParts({ enabled: true, text, content: text, max: 5 }); + const elapsed = Date.now() - start; + + expect(elapsed).toBeLessThan(1000); + const mediaParts = (result as MessageContentComplex[]).filter( + (p) => (p as Record).type === 'media', + ); + expect(mediaParts).toHaveLength(1); + expect((mediaParts[0] as Record).fileUri).toBe(WATCH('dQw4w9WgXcQ')); + }); +}); + +describe('long-text handling (no content discarded)', () => { + it('extracts a URL that appears far into a long prompt', () => { + const text = `${'word '.repeat(40000)}see https://youtu.be/dQw4w9WgXcQ`; // ~200KB before the URL + expect(extractYouTubeUrls(text, 5)).toEqual([WATCH('dQw4w9WgXcQ')]); + }); + + it('strips the injected URL even when large context is prepended to the content', () => { + /** Mirrors AgentClient prepending file/quote context to `content` while extraction uses `text`. */ + const prompt = 'summarize https://youtu.be/dQw4w9WgXcQ'; + const prependedContext = 'attached context line.\n'.repeat(8000); // ~180KB preamble + const result = appendYouTubeVideoParts({ + enabled: true, + text: prompt, + content: prependedContext + prompt, + max: 5, + }); + + const parts = result as MessageContentComplex[]; + const textPart = parts.find((p) => (p as Record).type === 'text') as { + text: string; + }; + expect(textPart.text).not.toContain('youtu.be/dQw4w9WgXcQ'); + expect(textPart.text).toContain('attached context line.'); + const mediaParts = parts.filter((p) => (p as Record).type === 'media'); + expect(mediaParts).toHaveLength(1); + expect((mediaParts[0] as Record).fileUri).toBe(WATCH('dQw4w9WgXcQ')); + }); +}); diff --git a/packages/api/src/endpoints/google/youtube.ts b/packages/api/src/endpoints/google/youtube.ts index 58f3a08fe8..34cbb88534 100644 --- a/packages/api/src/endpoints/google/youtube.ts +++ b/packages/api/src/endpoints/google/youtube.ts @@ -60,37 +60,84 @@ export function resolveYouTubeInjectionConfig(params: { provider?: string; model return { max: isGemini25OrLater(params.model) ? DEFAULT_MAX_YOUTUBE_PARTS : 1 }; } +/** 11-char video id. */ +const ID = '[A-Za-z0-9_-]{11}'; /** - * Host + path alternation shared by the detection and strip regexes. Accepts any YouTube - * subdomain (`www.`, `m.`, `music.`, ...) for youtube.com plus youtu.be and youtube-nocookie - * embed links. The capture group is always the 11-char video id. + * Allowlist of characters that occur in a YouTube URL path/query; anything else ends the match. + * The detection tail excludes `:` โ€” it never appears in a real YouTube path/query, but `://` of a + * *nested* URL does, so excluding it lets the scan stop and find e.g. + * `watch?url=https://youtu.be/`. The strip tail re-adds `:` so the whole URL token (including a + * trailing URL-valued param like `&next=https://example.com`) is consumed and nothing is orphaned. */ -const YOUTUBE_HOST_PATH = - '(?:(?:[a-z0-9-]+\\.)*youtube\\.com\\/(?:watch\\?(?:\\S*?&)?v=|shorts\\/|live\\/|embed\\/|v\\/)' + - '|(?:www\\.)?youtube-nocookie\\.com\\/embed\\/' + - '|youtu\\.be\\/)'; +const DETECT_TAIL = '[A-Za-z0-9\\-._~%/?&=#@+]*'; +const STRIP_TAIL = '[A-Za-z0-9\\-._~%/?&=#@+:]*'; /** - * Matches YouTube watch/share/shorts/live/embed (incl. youtube-nocookie) URLs and captures the - * 11-char video id. The leading lookbehind rejects hosts embedded in a longer domain (e.g. - * `notyoutube.com`, `evil-youtube.com`); the trailing lookahead rejects ids inside a longer token. + * Builds the linear YouTube URL matcher (restricted to recognized single-video forms): + * - `youtu.be/` (group 1 = id) + * - `youtube[-nocookie].com/(shorts|live|embed|v)/` (group 2 = id) + * - `youtube[-nocookie].com/watch?` (group 3 = query, `v=` parsed afterwards) + * + * Properties that matter: + * - Linear (no ReDoS): each branch consumes its match in a single greedy pass, the `v=` location + * is parsed from the captured query (not re-scanned per occurrence), and the subdomain + * repetition is bounded (`{1,63}` label x `{0,10}` labels). Needs no scan caps. + * - Restricting to recognized routes means an UNrecognized YouTube URL (e.g. `/redirect?q=...`) + * does not match, so the global scan continues and still finds a nested `youtu.be/`. + * - The path/query allowlist ends the match at prose delimiters, so adjacent links in one token + * (comma/semicolon/pipe-separated, markdown `](url1)](url2)`) are matched separately. + * - The leading lookbehind rejects hosts embedded in a longer domain (`notyoutube.com`, + * `evil-youtube.com`). */ -const YOUTUBE_URL_REGEX = new RegExp( - `(?): string { - const replaced = text.replace(YOUTUBE_URL_STRIP_REGEX, (match, videoId) => { - if (!injectedIds.has(videoId)) { + if (text.length === 0) { + return text; + } + let changed = false; + const replaced = text.replace(YOUTUBE_STRIP_REGEX, (match, youtuBeId, pathId, watchQuery) => { + const id = videoIdFromGroups(youtuBeId, pathId, watchQuery); + if (id == null || !injectedIds.has(id)) { return match; } /** Test the whole URL: the timestamp can precede `v=` (e.g. `watch?t=90&v=`). */ if (YOUTUBE_TIMESTAMP_REGEX.test(match)) { return match; } + changed = true; return ''; }); - if (replaced === text) { + if (!changed) { return text; } return replaced @@ -139,8 +192,8 @@ export function extractYouTubeUrls(text?: string | null, max?: number): string[] YOUTUBE_URL_REGEX.lastIndex = 0; let match: RegExpExecArray | null; while ((match = YOUTUBE_URL_REGEX.exec(text)) !== null) { - const videoId = match[1]; - if (seen.has(videoId)) { + const videoId = videoIdFromGroups(match[1], match[2], match[3]); + if (videoId == null || seen.has(videoId)) { continue; } seen.add(videoId);