diff --git a/packages/api/src/endpoints/google/youtube.spec.ts b/packages/api/src/endpoints/google/youtube.spec.ts index 5a4c42a9c8..84fe7c230e 100644 --- a/packages/api/src/endpoints/google/youtube.spec.ts +++ b/packages/api/src/endpoints/google/youtube.spec.ts @@ -78,6 +78,54 @@ describe('extractYouTubeUrls', () => { expect(extractYouTubeUrls(text)).toEqual([WATCH('dQw4w9WgXcQ')]); }); + it('extracts both URLs from a comma-separated pair in one token', () => { + const text = 'https://youtu.be/aaaaaaaaaaa,https://youtu.be/bbbbbbbbbbb'; + expect(extractYouTubeUrls(text)).toEqual([WATCH('aaaaaaaaaaa'), WATCH('bbbbbbbbbbb')]); + }); + + it('extracts adjacent markdown-style links', () => { + const text = '[a](https://youtu.be/aaaaaaaaaaa)[b](https://youtu.be/bbbbbbbbbbb)'; + expect(extractYouTubeUrls(text)).toEqual([WATCH('aaaaaaaaaaa'), WATCH('bbbbbbbbbbb')]); + }); + + it('extracts adjacent links separated by semicolons or pipes', () => { + expect(extractYouTubeUrls('https://youtu.be/aaaaaaaaaaa;https://youtu.be/bbbbbbbbbbb')).toEqual( + [WATCH('aaaaaaaaaaa'), WATCH('bbbbbbbbbbb')], + ); + expect(extractYouTubeUrls('https://youtu.be/aaaaaaaaaaa|https://youtu.be/bbbbbbbbbbb')).toEqual( + [WATCH('aaaaaaaaaaa'), WATCH('bbbbbbbbbbb')], + ); + }); + + it('finds a nested youtu.be link inside an unrecognized YouTube URL', () => { + const text = 'https://www.youtube.com/redirect?q=https://youtu.be/dQw4w9WgXcQ'; + expect(extractYouTubeUrls(text)).toEqual([WATCH('dQw4w9WgXcQ')]); + }); + + it('finds a nested youtu.be link inside a watch URL with no own v=', () => { + const text = 'https://www.youtube.com/watch?url=https://youtu.be/dQw4w9WgXcQ'; + expect(extractYouTubeUrls(text)).toEqual([WATCH('dQw4w9WgXcQ')]); + }); + + it('skips a malformed v= and uses a later valid one', () => { + const text = 'https://www.youtube.com/watch?v=tooShort&list=x&v=dQw4w9WgXcQ'; + expect(extractYouTubeUrls(text)).toEqual([WATCH('dQw4w9WgXcQ')]); + }); + + it('ignores unrecognized YouTube routes with no nested video', () => { + expect(extractYouTubeUrls('https://www.youtube.com/results?search_query=cats')).toEqual([]); + expect(extractYouTubeUrls('https://www.youtube.com/@SomeChannel')).toEqual([]); + }); + + it('matches capitalized watch/embed paths (case-insensitive)', () => { + expect(extractYouTubeUrls('https://www.youtube.com/WATCH?v=dQw4w9WgXcQ')).toEqual([ + WATCH('dQw4w9WgXcQ'), + ]); + expect(extractYouTubeUrls('https://www.youtube.com/EMBED/dQw4w9WgXcQ')).toEqual([ + WATCH('dQw4w9WgXcQ'), + ]); + }); + it('ignores non-YouTube URLs', () => { expect(extractYouTubeUrls('https://example.com/watch?v=dQw4w9WgXcQ')).toEqual([]); }); @@ -324,6 +372,23 @@ describe('appendYouTubeVideoParts', () => { expect(mediaParts).toHaveLength(1); expect((mediaParts[0] as Record).fileUri).toBe(WATCH('aaaaaaaaaaa')); }); + + it('strips a watch URL fully when it has a URL-valued param after the id (no orphan)', () => { + const text = + 'summarize https://www.youtube.com/watch?v=dQw4w9WgXcQ&next=https://example.com please'; + const result = appendYouTubeVideoParts({ enabled: true, text, content: text, max: 5 }); + + const parts = result as MessageContentComplex[]; + const textPart = parts.find((p) => (p as Record).type === 'text') as { + text: string; + }; + expect(textPart.text).toBe('summarize please'); + expect(textPart.text).not.toContain('example.com'); + + const mediaParts = parts.filter((p) => (p as Record).type === 'media'); + expect(mediaParts).toHaveLength(1); + expect((mediaParts[0] as Record).fileUri).toBe(WATCH('dQw4w9WgXcQ')); + }); }); describe('resolveYouTubeInjectionConfig', () => { @@ -363,3 +428,99 @@ describe('resolveYouTubeInjectionConfig', () => { expect(resolveYouTubeInjectionConfig({ provider: Providers.GOOGLE })).toEqual({ max: 1 }); }); }); + +describe('ReDoS safety', () => { + it('returns quickly for a long malformed watch token with no v= parameter', () => { + /** One ~600KB non-whitespace token; before the fix this exhibited quadratic backtracking. */ + const malicious = 'https://www.youtube.com/watch?'.repeat(20000); + const start = Date.now(); + const result = extractYouTubeUrls(malicious, 5); + const elapsed = Date.now() - start; + + expect(result).toEqual([]); + expect(elapsed).toBeLessThan(1000); + }); + + it('returns quickly for a malformed path of millions of slashes', () => { + const malicious = `https://www.youtube.com/${'/'.repeat(3_000_000)}`; + const start = Date.now(); + const result = extractYouTubeUrls(malicious, 5); + + expect(result).toEqual([]); + expect(Date.now() - start).toBeLessThan(1000); + }); + + it('returns quickly for many medium malformed watch tokens', () => { + const token = 'https://www.youtube.com/watch?'.repeat(50); // ~1.5KB malformed token + const malicious = `${token} `.repeat(2000); // ~3MB of whitespace-separated malformed tokens + const start = Date.now(); + const result = extractYouTubeUrls(malicious, 5); + + expect(result).toEqual([]); + expect(Date.now() - start).toBeLessThan(1000); + }); + + it('still extracts a valid URL that appears before a huge token', () => { + const text = `https://youtu.be/dQw4w9WgXcQ ${'x'.repeat(60000)}`; + expect(extractYouTubeUrls(text, 5)).toEqual([WATCH('dQw4w9WgXcQ')]); + }); + + it('injects + strips quickly when a valid URL is followed by a huge malformed token', () => { + const text = `https://youtu.be/dQw4w9WgXcQ ${'https://www.youtube.com/watch?'.repeat(20000)}`; + const start = Date.now(); + const result = appendYouTubeVideoParts({ enabled: true, text, content: text, max: 5 }); + const elapsed = Date.now() - start; + + expect(elapsed).toBeLessThan(1000); + const mediaParts = (result as MessageContentComplex[]).filter( + (p) => (p as Record).type === 'media', + ); + expect(mediaParts).toHaveLength(1); + expect((mediaParts[0] as Record).fileUri).toBe(WATCH('dQw4w9WgXcQ')); + }); + + it('strips quickly when a valid URL precedes ~3MB of medium malformed tokens', () => { + /** Exercises the strip path: extraction finds the valid URL, then strip must stay bounded. */ + const tail = `${'https://www.youtube.com/watch?'.repeat(50)} `.repeat(2000); + const text = `https://youtu.be/dQw4w9WgXcQ ${tail}`; + const start = Date.now(); + const result = appendYouTubeVideoParts({ enabled: true, text, content: text, max: 5 }); + const elapsed = Date.now() - start; + + expect(elapsed).toBeLessThan(1000); + const mediaParts = (result as MessageContentComplex[]).filter( + (p) => (p as Record).type === 'media', + ); + expect(mediaParts).toHaveLength(1); + expect((mediaParts[0] as Record).fileUri).toBe(WATCH('dQw4w9WgXcQ')); + }); +}); + +describe('long-text handling (no content discarded)', () => { + it('extracts a URL that appears far into a long prompt', () => { + const text = `${'word '.repeat(40000)}see https://youtu.be/dQw4w9WgXcQ`; // ~200KB before the URL + expect(extractYouTubeUrls(text, 5)).toEqual([WATCH('dQw4w9WgXcQ')]); + }); + + it('strips the injected URL even when large context is prepended to the content', () => { + /** Mirrors AgentClient prepending file/quote context to `content` while extraction uses `text`. */ + const prompt = 'summarize https://youtu.be/dQw4w9WgXcQ'; + const prependedContext = 'attached context line.\n'.repeat(8000); // ~180KB preamble + const result = appendYouTubeVideoParts({ + enabled: true, + text: prompt, + content: prependedContext + prompt, + max: 5, + }); + + const parts = result as MessageContentComplex[]; + const textPart = parts.find((p) => (p as Record).type === 'text') as { + text: string; + }; + expect(textPart.text).not.toContain('youtu.be/dQw4w9WgXcQ'); + expect(textPart.text).toContain('attached context line.'); + const mediaParts = parts.filter((p) => (p as Record).type === 'media'); + expect(mediaParts).toHaveLength(1); + expect((mediaParts[0] as Record).fileUri).toBe(WATCH('dQw4w9WgXcQ')); + }); +}); diff --git a/packages/api/src/endpoints/google/youtube.ts b/packages/api/src/endpoints/google/youtube.ts index 58f3a08fe8..34cbb88534 100644 --- a/packages/api/src/endpoints/google/youtube.ts +++ b/packages/api/src/endpoints/google/youtube.ts @@ -60,37 +60,84 @@ export function resolveYouTubeInjectionConfig(params: { provider?: string; model return { max: isGemini25OrLater(params.model) ? DEFAULT_MAX_YOUTUBE_PARTS : 1 }; } +/** 11-char video id. */ +const ID = '[A-Za-z0-9_-]{11}'; /** - * Host + path alternation shared by the detection and strip regexes. Accepts any YouTube - * subdomain (`www.`, `m.`, `music.`, ...) for youtube.com plus youtu.be and youtube-nocookie - * embed links. The capture group is always the 11-char video id. + * Allowlist of characters that occur in a YouTube URL path/query; anything else ends the match. + * The detection tail excludes `:` — it never appears in a real YouTube path/query, but `://` of a + * *nested* URL does, so excluding it lets the scan stop and find e.g. + * `watch?url=https://youtu.be/`. The strip tail re-adds `:` so the whole URL token (including a + * trailing URL-valued param like `&next=https://example.com`) is consumed and nothing is orphaned. */ -const YOUTUBE_HOST_PATH = - '(?:(?:[a-z0-9-]+\\.)*youtube\\.com\\/(?:watch\\?(?:\\S*?&)?v=|shorts\\/|live\\/|embed\\/|v\\/)' + - '|(?:www\\.)?youtube-nocookie\\.com\\/embed\\/' + - '|youtu\\.be\\/)'; +const DETECT_TAIL = '[A-Za-z0-9\\-._~%/?&=#@+]*'; +const STRIP_TAIL = '[A-Za-z0-9\\-._~%/?&=#@+:]*'; /** - * Matches YouTube watch/share/shorts/live/embed (incl. youtube-nocookie) URLs and captures the - * 11-char video id. The leading lookbehind rejects hosts embedded in a longer domain (e.g. - * `notyoutube.com`, `evil-youtube.com`); the trailing lookahead rejects ids inside a longer token. + * Builds the linear YouTube URL matcher (restricted to recognized single-video forms): + * - `youtu.be/` (group 1 = id) + * - `youtube[-nocookie].com/(shorts|live|embed|v)/` (group 2 = id) + * - `youtube[-nocookie].com/watch?` (group 3 = query, `v=` parsed afterwards) + * + * Properties that matter: + * - Linear (no ReDoS): each branch consumes its match in a single greedy pass, the `v=` location + * is parsed from the captured query (not re-scanned per occurrence), and the subdomain + * repetition is bounded (`{1,63}` label x `{0,10}` labels). Needs no scan caps. + * - Restricting to recognized routes means an UNrecognized YouTube URL (e.g. `/redirect?q=...`) + * does not match, so the global scan continues and still finds a nested `youtu.be/`. + * - The path/query allowlist ends the match at prose delimiters, so adjacent links in one token + * (comma/semicolon/pipe-separated, markdown `](url1)](url2)`) are matched separately. + * - The leading lookbehind rejects hosts embedded in a longer domain (`notyoutube.com`, + * `evil-youtube.com`). */ -const YOUTUBE_URL_REGEX = new RegExp( - `(?): string { - const replaced = text.replace(YOUTUBE_URL_STRIP_REGEX, (match, videoId) => { - if (!injectedIds.has(videoId)) { + if (text.length === 0) { + return text; + } + let changed = false; + const replaced = text.replace(YOUTUBE_STRIP_REGEX, (match, youtuBeId, pathId, watchQuery) => { + const id = videoIdFromGroups(youtuBeId, pathId, watchQuery); + if (id == null || !injectedIds.has(id)) { return match; } /** Test the whole URL: the timestamp can precede `v=` (e.g. `watch?t=90&v=`). */ if (YOUTUBE_TIMESTAMP_REGEX.test(match)) { return match; } + changed = true; return ''; }); - if (replaced === text) { + if (!changed) { return text; } return replaced @@ -139,8 +192,8 @@ export function extractYouTubeUrls(text?: string | null, max?: number): string[] YOUTUBE_URL_REGEX.lastIndex = 0; let match: RegExpExecArray | null; while ((match = YOUTUBE_URL_REGEX.exec(text)) !== null) { - const videoId = match[1]; - if (seen.has(videoId)) { + const videoId = videoIdFromGroups(match[1], match[2], match[3]); + if (videoId == null || seen.has(videoId)) { continue; } seen.add(videoId);