🩹 fix: LibreOffice PDF embed uses blob: URL (Chrome blocks data: PDFs)

Manual e2e on PR #12934: enabling `OFFICE_PREVIEW_LIBREOFFICE=true`
on a host with `soffice` installed surfaced "This page has been
blocked by Chrome" inside the PDF preview iframe.

Root cause: Chrome blocks `data:application/pdf;base64,...`
navigations inside sandboxed iframes (anti-phishing measure since
Chrome 76, see crbug.com/863001). The Sandpack iframe IS sandboxed
(its `sandbox="..."` attribute lacks `allow-top-navigation` for
data: URLs specifically), so when our inner `<iframe src="data:
application/pdf;...">` tries to navigate, Chrome's interstitial
fires and renders the "blocked" message.

Fix: switch from `data:` URL to `blob:` URL. The bootstrap now:
  1. Reads the base64 payload from a `<script type="application/
     octet-stream;base64">` data block (same pattern as the DOCX
     and PPTX wrappers).
  2. Decodes via `atob` + `Uint8Array.from`.
  3. Creates a `Blob` with `type: 'application/pdf'`.
  4. `URL.createObjectURL(blob)` produces a same-origin blob: URL.
  5. Sets `pdfFrame.src = url + '#view=FitH'` — Chrome treats blob:
     URLs as legitimate navigation and serves the built-in PDF
     viewer.

CSP updated: `frame-src blob:` (was `frame-src data:`). `data:` is
now explicitly NOT allowed in `frame-src` since Chrome would block
it anyway in our context — keeping it would be misleading
documentation.

Bonus: failure paths now log to `console.error` with a
`[libreoffice-pdf]` prefix so DevTools surfaces blob-creation
failures and PDF-viewer load timeouts in red.

Tests updated:
- "emits a complete sandboxed HTML document" now asserts the
  data-block + blob URL construction (not the old data: URL).
- New CSP test "allows blob: in frame-src (NOT data:)" with both
  positive and negative assertions to lock in the change.
- Integration test for `tryLibreOfficePreview` updated to look for
  the data block + `URL.createObjectURL` instead of the data: URL.
- Large-payload test now verifies the data block round-trip rather
  than data: URL escaping (base64 alphabet has no characters that
  break out of `<script>` anyway).
This commit is contained in:
Danny Avila 2026-05-04 21:09:18 -04:00
parent 06c3bfa3c0
commit d90f26c11c
2 changed files with 100 additions and 42 deletions

View file

@ -80,22 +80,40 @@ describe('libreoffice (env gating + wrapper)', () => {
* wrappers. */
const FAKE_PDF_B64 = 'JVBERi0xLjQK'; // "%PDF-1.4\n" base64
it('emits a complete sandboxed HTML document', () => {
it('emits a complete sandboxed HTML document with PDF bytes embedded as a data block', () => {
const html = buildPdfEmbedDocument(FAKE_PDF_B64);
expect(html).toMatch(/^<!DOCTYPE html>/);
expect(html).toContain('<title>Preview</title>');
expect(html).toContain('id="lc-pdf"');
expect(html).toContain(`data:application/pdf;base64,${FAKE_PDF_B64}`);
/* PDF bytes live in a `<script type="application/octet-stream;base64">`
* data block — the bootstrap reads it and constructs a blob: URL
* at runtime. We deliberately do NOT use `<iframe src="data:
* application/pdf;...">` because Chrome blocks data: navigations
* inside sandboxed iframes (manual e2e on PR #12934 surfaced this
* as the "This page has been blocked by Chrome" interstitial).
* blob: URLs are same-origin and bypass that restriction. */
expect(html).toContain('id="lc-pdf-data"');
expect(html).toContain(FAKE_PDF_B64);
expect(html).not.toMatch(/src="data:application\/pdf/);
/* The bootstrap code that converts the base64 to a blob URL. */
expect(html).toContain('URL.createObjectURL');
expect(html).toContain('new Blob');
expect(html).toContain("type: 'application/pdf'");
});
it('CSP locks the iframe down: no script CDN, no outbound connect, no eval', () => {
it('CSP allows blob: in frame-src (NOT data:) and locks the iframe down otherwise', () => {
const html = buildPdfEmbedDocument(FAKE_PDF_B64);
const cspMatch = html.match(/<meta http-equiv="Content-Security-Policy" content="([^"]+)">/);
expect(cspMatch).not.toBeNull();
const csp = cspMatch![1];
expect(csp).toMatch(/default-src 'none'/);
/* The whole point: data: URIs in nested iframes (browser PDF viewer). */
expect(csp).toMatch(/frame-src data:/);
/* blob: in frame-src, NOT data: — Chrome blocks data:application/pdf
* navigations inside sandboxed iframes (anti-phishing measure
* since Chrome 76). The bootstrap creates blob: URLs at runtime
* which Chrome treats as same-origin and allows. Manual e2e on
* PR #12934. */
expect(csp).toMatch(/frame-src[^;]*\bblob:/);
expect(csp).not.toMatch(/frame-src[^;]*\bdata:/);
/* No outbound HTTP from the rendered iframe — a malicious PDF
* can't beacon home from inside the viewer. */
expect(csp).toMatch(/connect-src 'none'/);
@ -113,6 +131,8 @@ describe('libreoffice (env gating + wrapper)', () => {
expect(html).toContain('PDF preview unavailable in this browser');
/* The 4-second heuristic timer that swaps to the fallback. */
expect(html).toContain('4000');
/* Reasons logged to console.error for power-user debugging. */
expect(html).toContain("console.error('[libreoffice-pdf] fallback fired:'");
});
it('uses #view=FitH so the PDF fills the panel width on first paint', () => {
@ -120,16 +140,18 @@ describe('libreoffice (env gating + wrapper)', () => {
expect(html).toContain('#view=FitH');
});
it('embeds large base64 payloads without breaking out of the data URI', () => {
/* A `</script>` substring in the base64 wouldn't terminate the
* iframe `src=` attribute, but a stray `"` would. base64 alphabet
* is `A-Za-z0-9+/=` — none of those are dangerous. Sanity-check
* with a synthetically large payload. */
it('embeds large base64 payloads inside the data block without escaping issues', () => {
/* The base64 alphabet (A-Za-z0-9+/=) contains no characters that
* could break out of `<script type="application/octet-stream;
* base64">...</script>` — base64 cannot contain `<`, `>`, `&`, or
* quote characters. Sanity-check that the data round-trips. */
const big = 'A'.repeat(100_000);
const html = buildPdfEmbedDocument(big);
const src = html.match(/src="data:application\/pdf;base64,([^"]+)/);
expect(src).not.toBeNull();
expect(src![1].length).toBe(big.length + '#view=FitH'.length);
const dataBlock = html.match(
/<script id="lc-pdf-data" type="application\/octet-stream;base64">([^<]+)<\/script>/,
);
expect(dataBlock).not.toBeNull();
expect(dataBlock![1]).toBe(big);
});
});
@ -274,7 +296,11 @@ describe('libreoffice integration (skipped unless LibreOffice is on $PATH)', ()
const out = await tryLibreOfficePreview(buf, 'docx', 512 * 1024);
expect(out).not.toBeNull();
expect(out!).toMatch(/^<!DOCTYPE html>/);
expect(out!).toContain('data:application/pdf;base64,');
/* PDF bytes are embedded as a base64 data block (not as a data:
* URL — Chrome blocks data:application/pdf in sandboxed iframes;
* the bootstrap converts to a blob: URL at runtime). */
expect(out!).toContain('id="lc-pdf-data"');
expect(out!).toContain('URL.createObjectURL');
expect(Buffer.byteLength(out!, 'utf-8')).toBeLessThanOrEqual(512 * 1024);
},
35_000,

View file

@ -258,31 +258,42 @@ function runConversion(binary: string, inputPath: string, tempDir: string): Prom
* `<iframe>` so the host browser's PDF viewer (PDF.js in Firefox, Chrome's
* built-in viewer, Safari's Preview-driven viewer) can render it directly.
*
* Why an inner iframe rather than `<embed>` or `<object>`:
* - `<embed>` and `<object>` rendering is least consistent across modern
* browsers (Chrome's pdfium plugin requires CSP `object-src data:`,
* and some headless contexts disable it).
* - `<iframe src="data:application/pdf;base64,...">` is the most
* reliable cross-browser path. Chromium's PDF viewer treats it as a
* top-level navigation and serves the built-in viewer.
* Why blob: URL (vs data: URL):
* Chrome blocks `data:application/pdf` navigations inside sandboxed
* iframes (anti-phishing measure since Chrome 76 — surfaces as a
* "This page has been blocked by Chrome" interstitial). The Sandpack
* iframe IS sandboxed, so the inner iframe's data: navigation hits
* that block. Constructing a `blob:` URL at runtime via
* `URL.createObjectURL(new Blob([bytes], {type: 'application/pdf'}))`
* produces a same-origin URL that Chrome treats as legitimate
* navigation — it works inside sandboxed contexts where data: doesn't.
* Manual e2e on PR #12934.
*
* The inner iframe is fully sandboxed — no script, same-origin, etc. —
* and uses `#view=FitH` to size to the panel's width on first paint.
* Why an inner iframe rather than `<embed>` or `<object>`:
* `<embed>` and `<object>` rendering is least consistent across modern
* browsers (Chrome's pdfium plugin requires CSP `object-src` and
* some headless contexts disable it). `<iframe src="blob:...">` is
* the most reliable cross-browser path; Chromium/Firefox/Safari all
* serve their built-in PDF viewer for it.
*
* The inner iframe uses `#view=FitH` to size to the panel's width
* on first paint.
*/
export function buildPdfEmbedDocument(pdfBase64: string): string {
/* CSP scoping:
* - `default-src 'none'`: lock everything down.
* - `frame-src data:`: allow the inner `<iframe src="data:application/pdf;...">`.
* - `object-src 'self' data:`: belt-and-suspenders for browsers that
* route PDFs through `<object>` via the iframe sandbox quirk.
* - `script-src 'unsafe-inline'`: only our tiny load-detector script.
* - `frame-src blob:`: allow the inner `<iframe src="blob:...">`
* navigation that the bootstrap creates from the PDF bytes.
* `data:` is intentionally NOT in `frame-src` because Chrome
* blocks it in sandboxed contexts anyway.
* - `script-src 'unsafe-inline'`: only our tiny bootstrap script.
* - `style-src 'unsafe-inline'`: page chrome (no external sheets).
* - `connect-src 'none'`: rendered iframe makes no network calls.
*/
const csp = [
"default-src 'none'",
'frame-src data:',
"object-src 'self' data:",
'frame-src blob:',
"object-src 'self' blob:",
"script-src 'unsafe-inline'",
"style-src 'unsafe-inline'",
"img-src 'self' data: blob:",
@ -307,26 +318,47 @@ html, body { margin: 0; padding: 0; height: 100%; background: var(--bg); color:
</style>
</head>
<body>
<iframe id="lc-pdf" src="data:application/pdf;base64,${pdfBase64}#view=FitH" title="PDF preview"></iframe>
<iframe id="lc-pdf" title="PDF preview"></iframe>
<div id="lc-fallback">PDF preview unavailable in this browser. Please download the file to view it.</div>
<script id="lc-pdf-data" type="application/octet-stream;base64">${pdfBase64}</script>
<script>
(function () {
/* Some browsers / kiosk profiles disable the built-in PDF viewer; the
* iframe loads but stays blank. We can't reliably detect the inner
* viewer's success across browsers, so we use a 4-second heuristic:
* if the iframe never reports a load event by then, swap to the
* fallback message. False negatives (slow networks, cold viewers)
* are acceptable the user can still download the file. */
var pdfFrame = document.getElementById('lc-pdf');
var fallback = document.getElementById('lc-fallback');
var loaded = false;
if (pdfFrame) {
pdfFrame.addEventListener('load', function () { loaded = true; });
if (!pdfFrame || !fallback) { return; }
function showFallback(reason) {
pdfFrame.style.display = 'none';
fallback.classList.add('visible');
if (reason && typeof console !== 'undefined' && console.error) {
console.error('[libreoffice-pdf] fallback fired:', reason);
}
}
/* Decode the embedded base64 and create a blob: URL. Chrome blocks
* data:application/pdf in sandboxed iframes (parent Sandpack iframe
* is sandboxed); blob: URLs are treated as same-origin and bypass
* that restriction. Manual e2e on PR #12934 "This page has been
* blocked by Chrome" interstitial was the symptom. */
var loaded = false;
try {
var b64 = document.getElementById('lc-pdf-data').textContent.trim();
var bytes = Uint8Array.from(atob(b64), function (c) { return c.charCodeAt(0); });
var blob = new Blob([bytes], { type: 'application/pdf' });
var url = URL.createObjectURL(blob);
pdfFrame.addEventListener('load', function () { loaded = true; });
pdfFrame.src = url + '#view=FitH';
} catch (err) {
showFallback((err && err.message) || 'blob-creation-failed');
return;
}
/* 4-second heuristic: if the iframe never reports a load event by
* then, the host browser PDF viewer is probably disabled (kiosk
* profile, Brave Shields, etc.). Swap to the fallback message. */
setTimeout(function () {
if (!loaded && fallback) {
if (pdfFrame) { pdfFrame.style.display = 'none'; }
fallback.classList.add('visible');
if (!loaded) {
showFallback('pdf-viewer-load-timeout');
}
}, 4000);
})();