diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json new file mode 100644 index 00000000..ddde0dbc --- /dev/null +++ b/.claude-plugin/marketplace.json @@ -0,0 +1,9 @@ +{ + "plugins": [ + { + "name": "js-asset-auditor", + "description": "Audit publisher pages for third-party JS assets and generate js-assets.toml entries using Playwright", + "path": "packages/js-asset-auditor" + } + ] +} diff --git a/.claude/settings.json b/.claude/settings.json index 02b602d4..8a63480b 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -23,9 +23,13 @@ "Bash(git diff:*)", "Bash(git log:*)", "Bash(git status:*)", + "Bash(node scripts/js-asset-slug.mjs:*)", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__new_page", - "mcp__plugin_chrome-devtools-mcp_chrome-devtools__performance_stop_trace", + "mcp__plugin_chrome-devtools-mcp_chrome-devtools__navigate_page", + "mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script", + "mcp__plugin_chrome-devtools-mcp_chrome-devtools__close_page", + "mcp__plugin_chrome-devtools-mcp_chrome-devtools__performance_stop_trace" ] }, "enabledPlugins": { diff --git a/.gitignore b/.gitignore index af70c452..f9bc0ef2 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,9 @@ src/*.html /guest-profiles /benchmark-results/** +# JS Asset Auditor plugin +/packages/js-asset-auditor/node_modules/ + # Playwright browser tests /crates/integration-tests/browser/node_modules/ /crates/integration-tests/browser/test-results/ diff --git a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md new file mode 100644 index 00000000..207410b4 --- /dev/null +++ b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md @@ -0,0 +1,366 @@ +# JS Asset Auditor — Engineering Spec + +**Date:** 2026-04-01 +**Status:** Approved for engineering breakdown +**Related:** [JS Asset Proxy 
spec](2026-04-01-js-asset-proxy-design.md) _(on `js-asset-proxy-spec` branch until merged)_ + +--- + +## Context + +The JS Asset Proxy requires a `js-assets.toml` file declaring which third-party JS assets to proxy. Without tooling, populating this file requires manually inspecting network requests in browser DevTools, extracting URLs, generating opaque slugs, and writing TOML — a tedious error-prone process that is a barrier to publisher onboarding. + +The Auditor eliminates this friction. It sweeps a publisher's page using Playwright (headless Chromium), detects third-party JS assets, auto-generates `js-assets.toml` entries, and auto-detects `inject_in_head` from the page DOM. The operator's only remaining decision is reviewing the output before committing. + +It also runs as a monitoring tool — `--diff` mode compares a new sweep against the existing config and surfaces new or removed assets, giving publishers ongoing visibility into their third-party JS footprint. + +**Implementation:** Claude Code plugin at `packages/js-asset-auditor/` containing a standalone Playwright CLI, a processing library, and a skill definition. No Rust, no compiled code. Can also be run directly without Claude Code. 
+ +--- + +## Command Interface + +```bash +# Via Claude Code plugin skill +/js-asset-auditor:audit-js-assets https://www.publisher.com # init — generate js-assets.toml +/js-asset-auditor:audit-js-assets https://www.publisher.com --diff # diff — compare against existing file +/js-asset-auditor:audit-js-assets https://www.publisher.com --settle 15000 # longer settle for ad-tech-heavy pages +/js-asset-auditor:audit-js-assets https://www.publisher.com --no-filter # bypass heuristic filtering +/js-asset-auditor:audit-js-assets https://www.publisher.com --headless # headless mode for CI/automation +/js-asset-auditor:audit-js-assets https://www.publisher.com --config # also generate trusted-server.toml + +# Direct CLI invocation (no Claude Code required) +node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com +node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com --domain publisher.com +node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com --diff --output js-assets.toml +node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com --config my-config.toml +``` + +--- + +## Sweep Protocol + +The CLI (`packages/js-asset-auditor/lib/audit.mjs`) performs the full sweep: + +1. Resolve publisher domain: `--domain` flag → `trusted-server.toml` → infer from target URL +2. Launch Chromium via Playwright (headed by default to avoid bot detection; `--headless` for CI) +3. Register a response listener for `resourceType() === 'script'` to capture all script network requests +4. Navigate to target URL (`page.goto`, 30s timeout, follows redirects transparently) +5. Wait for page load settle: `page.waitForTimeout(SETTLE_MS)` where `SETTLE_MS` defaults to 6000 (configurable via `--settle <ms>`) +6. Evaluate `document.head.querySelectorAll('script[src]')` to collect head-loaded script URLs +7. Close browser +8. 
Pass collected URLs to `processAssets()` from `lib/process.mjs` — applies URL normalization, first-party filtering, heuristic filtering, wildcard detection, slug generation +9. Write `js-assets.toml` output (init or diff mode) +10. Print JSON summary to stdout (progress lines go to stderr) + +**`inject_in_head` semantics:** The DOM snapshot in step 6 captures the final state of `<head>` after the settle window. Scripts that were briefly inserted and then removed by a loader will not appear. This is intentional — `inject_in_head = true` means "the script is present in `<head>` at page-stable state." If a loader removes it before the snapshot, the proxy should not re-inject it. + +--- + +## URL Processing + +### First-party boundary + +A network request is **first-party** if the request URL's host, after stripping a leading `www.`, matches the publisher domain after the same stripping. Matching is exact on the resulting strings. + +**Domain resolution order:** `--domain <domain>` flag → `publisher.domain` from `trusted-server.toml` → inferred from the target URL's hostname. This makes the tool usable in any project — `trusted-server.toml` is not required. + +**Auto-detection:** The target URL's hostname is automatically included as first-party, in addition to the resolved publisher domain. This ensures that auditing `https://golf.com` when `publisher.domain = "test-publisher.com"` correctly excludes `golf.com` scripts without requiring `--first-party golf.com`. + +Publisher-owned CDN subdomains (e.g., `cdn.publisher.com`, `static.publisher.com`) are treated as third-party by default. If the publisher wants to exclude them, they can be added via `--first-party cdn.publisher.com`. + +### URL normalization + +Applied to every captured script URL before slug generation and before persisting `origin_url`: + +1. Strip fragment (`#...`) +2. Strip all query parameters — cache-busters (`?v=123`, `?cb=timestamp`), consent params, and session tokens all live in query strings. 
JS asset versioning uses path segments, not query params. +3. Strip trailing slash from the path + +The normalized URL is what gets stored in `origin_url` and fed into the slug hash. + +--- + +## Heuristic Filter + +The following origin categories are excluded silently. The terminal summary reports what was filtered and why so operators can manually add entries if needed. + +**Matching:** Filter entries match if the request URL's host ends with the filter entry, with a dot-boundary check. For example, `googletagmanager.com` in the filter matches `www.googletagmanager.com` but not `evil-googletagmanager.com`. + +| Category | Excluded origins | +| ------------------- | --------------------------------------------------------------------------------------------- | +| Framework CDNs | `cdnjs.cloudflare.com`, `ajax.googleapis.com`, `cdn.jsdelivr.net`, `unpkg.com` | +| Error tracking | `sentry.io`, `bugsnag.com`, `rollbar.com` | +| Font services | `fonts.googleapis.com`, `fonts.gstatic.com` | +| Social embeds | `platform.twitter.com`, `platform.x.com`, `connect.facebook.net` | +| Google ad rendering | `pagead2.googlesyndication.com`, `tpc.googlesyndication.com`, `s0.2mdn.net`, | +| | `googleads.g.doubleclick.net`, `www.googleadservices.com` | +| Ad fraud detection | `adtrafficquality.google` | +| Ad verification | `adsafeprotected.com`, `moatads.com`, `doubleverify.com` | +| reCAPTCHA | `recaptcha.net`, `www.google.com/recaptcha/*`, `www.gstatic.com/recaptcha/*` | + +**Path-prefix matching:** Some hosts (e.g., `www.google.com`) serve both filterable and non-filterable resources. Entries with a path suffix (e.g., `www.google.com/recaptcha/*`) match only when the URL's path begins with the specified prefix. Plain host entries use dot-boundary suffix matching as before. + +**`googletagmanager.com` is not filtered** — GTM is ad tech and should be proxied. + +**`securepubads.g.doubleclick.net` is not filtered** — this is the GPT ad server SDK. 
Publishers deliberately place this tag. Its sub-resources (e.g., `pubads_impl.js`) are also intentional. The filter targets ad-rendering infrastructure (iframes, creatives, verification), not ad-serving SDKs. + +**`--no-filter`** bypasses heuristic filtering entirely, surfacing all non-first-party scripts. First-party filtering always applies. + +Everything else surfaces for operator review. + +--- + +## Asset Entry Generation + +| Field | Derivation | +| ---------------- | --------------------------------------------------------------------------------------------------- | +| `slug` | `{publisher_prefix}:{asset_stem}` — see slug algorithm below | +| `path` | Fixed: `/js-assets/{publisher_prefix}/{asset_stem}.js`. Wildcard: `/js-assets/{publisher_prefix}/*` | +| `origin_url` | Normalized URL (see URL Processing), with wildcard substitution applied if versioned | +| `ttl_sec` | Omitted — proxy defaults to 1800 (wildcard) or 3600 (fixed) | +| `stale_ttl_sec` | Omitted — proxy defaults to 86400 (24h) | +| `inject_in_head` | `true` if URL appeared in head script list from DOM evaluation, else `false` | + +### Slug algorithm + +``` +publisher_prefix = first_8_chars(base62(sha256(publisher.domain + "|" + origin_url))) +asset_stem = filename_without_extension(origin_url) +slug = "{publisher_prefix}:{asset_stem}" +``` + +The pipe (`|`) separator is required — it cannot appear in domain names or at the start of a URL, so the hash input is unambiguous. The `origin_url` fed into the hash must be the normalized URL (see URL Processing). + +**base62 charset:** `0-9A-Za-z` (digits first, then uppercase, then lowercase). This matches the `base62` crate convention. + +**Rationale:** Fully opaque and hash-derived — no human naming required, no ambiguity for cryptic vendor filenames. The KV metadata (`origin_url`, `content_type`, `asset_slug`) serves as the lookup table. Operators can query `js-asset:{slug}` in the KV store to retrieve full provenance. 
The terminal summary also prints slug → origin_url at generation time. + +**Important:** This algorithm must produce identical output to the Proxy's KV key derivation. The reference implementation lives in `packages/js-asset-auditor/lib/slug.mjs` (standalone CLI) and `packages/js-asset-auditor/lib/process.mjs` (processing library), with a copy in `scripts/js-asset-slug.mjs`. Any changes must be synchronized across all files and the Rust proxy. + +### Wildcard detection + +Path segments matching any of these patterns are replaced with `*`: + +- Semver: `\d+\.\d+[\.\d\w-]*` (e.g., `1.19.8-hcskhn`) +- Hex hash: `[a-f0-9]{8,}` between path separators (lowercase hex, minimum 8 characters) +- Mixed alphanumeric hash: `[A-Za-z0-9]{8,}` between path separators, **must contain at least one digit and at least one letter** — this excludes pure-alpha dictionary words like `analytics` or `bootstrap` + +The original URL is preserved as a comment above the generated entry so operators can verify the wildcard substitution is correct. 
+ +--- + +## Init Mode Output + +### `js-assets.toml` (written to repo root) + +```toml +# Generated by /audit-js-assets on 2026-04-01 +# Publisher: publisher.com +# Source URL: https://www.publisher.com + +[[js_assets]] +# https://web.prebidwrapper.com/golf-WnLmpLyEjL/default-v2/prebid-load.js +slug = "aB3kR7mN:prebid-load" +path = "/js-assets/aB3kR7mN/prebid-load.js" +origin_url = "https://web.prebidwrapper.com/golf-WnLmpLyEjL/default-v2/prebid-load.js" +inject_in_head = true + +[[js_assets]] +# https://raven-static.vendor.io/prod/1.19.8-hcskhn/raven.js (wildcard detected) +slug = "xQ9pL2wY:raven" +path = "/js-assets/xQ9pL2wY/*" +origin_url = "https://raven-static.vendor.io/prod/*/raven.js" +inject_in_head = false +``` + +### Terminal summary + +``` +JS Asset Audit — publisher.com +──────────────────────────────── +Detected: 8 third-party JS requests +Filtered: 3 (cdnjs.cloudflare.com ×2, sentry.io ×1) +Surfaced: 5 assets → js-assets.toml + + aB3kR7mN inject_in_head=true web.prebidwrapper.com/.../prebid-load.js + xQ9pL2wY inject_in_head=false raven-static.vendor.io/prod/*/raven.js [wildcard] + zM4nK8vP inject_in_head=true googletagmanager.com/gtm.js + ... + +Review inject_in_head values and commit js-assets.toml when ready. +Diff mode: /audit-js-assets --diff +``` + +--- + +## Diff Mode Output + +Compares sweep results against the existing `js-assets.toml`. + +| Condition | Behavior | +| --------------------------- | ----------------------------------------------------------------------- | +| Asset in sweep, not in file | **New** — appended to `js-assets.toml` as a commented-out block | +| Asset in file, not in sweep | **Missing** — flagged in terminal summary with `⚠`. Never auto-removed. | +| Asset in both | **Confirmed** — listed as present | + +New entries are appended as TOML comments so the file stays valid and nothing is activated without the operator explicitly uncommenting. 
+ +### `js-assets.toml` (new entry appended as comment) + +```toml +# --- NEW (detected by /audit-js-assets --diff on 2026-04-01, uncomment to activate) --- +# [[js_assets]] +# # https://googletagmanager.com/gtm.js +# slug = "zM4nK8vP:gtm" +# path = "/js-assets/zM4nK8vP/gtm.js" +# origin_url = "https://googletagmanager.com/gtm.js" +# inject_in_head = true +``` + +### Terminal summary (diff mode) + +``` +JS Asset Audit (diff) — publisher.com +──────────────────────────────── +Confirmed: 4 assets still present on page +New: 1 asset detected (appended as comment to js-assets.toml) +Missing: 1 asset no longer seen on page ⚠ + + NEW zM4nK8vP googletagmanager.com/gtm.js → review in js-assets.toml + MISSING xQ9pL2wY raven-static.vendor.io/... → may have been removed or renamed +``` + +--- + +## Integration Detection & Config Generation + +When invoked with `--config [path]`, the CLI also detects known integrations from the swept URLs and generates a `trusted-server.toml` with appropriate `[integrations.*]` sections. + +### Detection patterns + +Integration detection runs on raw URLs (before normalization) to preserve query parameters needed for field extraction. 
+ +| URL Pattern | Integration | Extracted Fields | +| -------------------------------------------------- | ---------------------- | ----------------------------------------- | +| `securepubads.g.doubleclick.net/tag/js/gpt*` | `gpt` | `script_url` | +| `www.googletagmanager.com/gtm.js?id=GTM-XXX` | `google_tag_manager` | `container_id` from `?id=` | +| `sdk.privacy-center.org` | `didomi` | (defaults) | +| `js.datadome.co` | `datadome` | (defaults) | +| `aim.loc.kr/*identity-lockr*.js` | `lockr` | `sdk_url` | +| `*.edge.permutive.app/*-web.js` | `permutive` | `organization_id`, `workspace_id` from URL | +| `*/prebid.js`, `*/prebidjs.js` (+ .min variants) | `prebid` | (detect only) | +| `c.amazon-adsystem.com/aax2/apstag*` | `aps` | (detect only) | + +### Field categories + +- **Full** — all config fields have defaults or are auto-extracted. Config section is ready to use. +- **Partial** — some fields auto-extracted, others need manual input (marked with `# TODO:`). +- **Detect only** — integration detected but key fields (e.g., `server_url`, `pub_id`) require manual input. + +### Config output + +```toml +# Generated by js-asset-auditor on 2026-04-13 +# Source URL: https://www.publisher.com + +[publisher] +domain = "publisher.com" +# cookie_domain = ".publisher.com" +# origin_url = "https://origin.publisher.com" +# proxy_secret = "change-me" + +[integrations.gpt] +enabled = true +script_url = "https://securepubads.g.doubleclick.net/tag/js/gpt.js" # auto-detected +# cache_ttl_seconds = 3600 +# rewrite_script = true + +[integrations.google_tag_manager] +enabled = true +container_id = "GTM-TRCJMD6" # auto-detected + +[integrations.lockr] +enabled = true +sdk_url = "https://aim.loc.kr/identity-lockr-trust-server.js" # auto-detected +app_id = "" # TODO: set your Lockr Identity app_id +# api_endpoint = "https://identity.loc.kr" +``` + +If the target file already exists, the CLI errors unless `--force` is passed. 
+ +--- + +## Implementation + +The Auditor is packaged as a Claude Code plugin at `packages/js-asset-auditor/` with three components: + +``` +packages/js-asset-auditor/ +├── .claude-plugin/plugin.json # Plugin manifest +├── skills/audit-js-assets/SKILL.md # Skill definition +├── bin/audit-js-assets # Executable (added to PATH by Claude Code) +├── lib/ +│ ├── audit.mjs # Playwright CLI — browser automation + orchestration +│ ├── detect.mjs # Integration detection engine + config generation +│ ├── process.mjs # Processing library — normalization, filtering, slugs, TOML +│ └── slug.mjs # Standalone slug generator +├── package.json # playwright dependency +└── settings.json # Auto-grants Bash(audit-js-assets:*) permission +``` + +1. **Playwright CLI** (`lib/audit.mjs`) — Launches Chromium (headed by default), navigates to the target URL, collects script network requests and head script DOM state, then calls `processAssets()`. Outputs TOML file + JSON summary. Can be run directly without Claude Code. +2. **Processing library** (`lib/process.mjs`) — Pure Node.js module (no external dependencies) that exports `processAssets()` and individual utility functions. Handles URL normalization, first-party filtering, heuristic filtering, wildcard detection, slug generation, and TOML formatting. +3. **Claude Code skill** (`skills/audit-js-assets/SKILL.md`) — Thin wrapper that invokes the CLI via the `bin/audit-js-assets` executable and formats the JSON summary. 
+ +**Plugin installation:** + +```bash +# Local testing (loads for one session) +claude --plugin-dir packages/js-asset-auditor + +# Via marketplace (permanent installation) +/plugin marketplace add <owner>/<repo> +/plugin install js-asset-auditor +``` + +**Setup (one-time after install):** + +```bash +cd packages/js-asset-auditor && npm install && npx playwright install chromium +``` + +**Standalone utilities:** + +- `scripts/js-asset-slug.mjs` — Standalone slug generator for individual URLs (kept outside the plugin for backward compatibility) + +--- + +## Delivery Order + +The Auditor should be delivered **after Proxy Phase 1** (so `js-assets.toml` schema is defined) and **before Proxy Phase 2** (so engineering has real populated entries to test the cache pipeline against actual vendor origins). + +See [delivery order in the Proxy spec](2026-04-01-js-asset-proxy-design.md) _(on `js-asset-proxy-spec` branch until merged)_. + +--- + +## Verification + +- Run `node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com` against a known test publisher page +- Verify generated entries match actual third-party JS observed on the page (cross-check in browser DevTools) +- Verify `inject_in_head = true` only for scripts that appear in `<head>` (not `<body>`) +- Verify wildcard detection fires for versioned path segments (e.g., `1.19.13-0fnlww`) and not for stable paths +- Verify GTM (`googletagmanager.com`) is captured and not filtered +- Verify Google ad rendering infra (`pagead2.googlesyndication.com`, `s0.2mdn.net` etc.) 
is filtered with reason in summary +- Verify `securepubads.g.doubleclick.net` (GPT) is **not** filtered +- Verify first-party auto-detection: auditing `golf.com` with `publisher.domain = "test-publisher.com"` excludes `golf.com` scripts +- Run `--diff` against an unchanged page → all entries confirmed, no new/missing +- Run `--diff` after adding a new vendor script to the page → appears as `NEW` in summary +- Run `--diff` after removing a script → appears as `MISSING ⚠` in summary, file unchanged +- Run `/js-asset-auditor:audit-js-assets <url>` via Claude Code plugin → identical results to direct CLI invocation +- Run CLI without `trusted-server.toml` (using `--domain` or domain inference) → works in any project +- Run with `--config` → generates `trusted-server.toml` with detected integrations +- Verify GTM `container_id` is auto-extracted from `?id=GTM-XXXXX` query param +- Verify integrations with TODO fields are marked with `# TODO:` comments +- Verify `--config` without `--force` errors when target file exists +- Verify JSON summary includes `integrations` array when `--config` is used diff --git a/packages/js-asset-auditor/.claude-plugin/plugin.json b/packages/js-asset-auditor/.claude-plugin/plugin.json new file mode 100644 index 00000000..b6be8d85 --- /dev/null +++ b/packages/js-asset-auditor/.claude-plugin/plugin.json @@ -0,0 +1,10 @@ +{ + "name": "js-asset-auditor", + "version": "1.0.0", + "description": "Audit publisher pages for third-party JS assets and generate js-assets.toml entries using Playwright", + "author": { + "name": "StackPop" + }, + "license": "MIT", + "keywords": ["js-assets", "audit", "playwright", "ad-tech", "proxy"] +} diff --git a/packages/js-asset-auditor/bin/audit-js-assets b/packages/js-asset-auditor/bin/audit-js-assets new file mode 100755 index 00000000..cdff67ef --- /dev/null +++ b/packages/js-asset-auditor/bin/audit-js-assets @@ -0,0 +1,11 @@ +#!/usr/bin/env node + +// Plugin bin/ wrapper — resolves lib/audit.mjs relative to plugin 
root,
+// not the user's working directory.
+
+import { fileURLToPath } from "node:url";
+import { dirname, resolve } from "node:path";
+
+const pluginRoot = resolve(dirname(fileURLToPath(import.meta.url)), "..");
+const { main } = await import(resolve(pluginRoot, "lib/audit.mjs"));
+main();
diff --git a/packages/js-asset-auditor/lib/audit.mjs b/packages/js-asset-auditor/lib/audit.mjs
new file mode 100644
index 00000000..7334ece8
--- /dev/null
+++ b/packages/js-asset-auditor/lib/audit.mjs
@@ -0,0 +1,335 @@
+#!/usr/bin/env node
+
+// JS Asset Auditor CLI
+//
+// Standalone Playwright-based tool that sweeps a publisher page for third-party
+// JS assets and generates js-assets.toml entries.
+//
+// Usage:
+//   node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com [options]
+//   audit-js-assets https://www.publisher.com [options]  (when plugin bin/ is in PATH)
+//
+// Options:
+//   --domain <domain>       Publisher domain (default: trusted-server.toml, then target URL)
+//   --diff                  Compare against existing js-assets.toml
+//   --settle <ms>           Settle window after page load (default: 6000)
+//   --first-party <hosts>   Additional first-party hosts (comma-separated)
+//   --no-filter             Bypass heuristic filtering
+//   --headless              Run without a visible browser window (default: headed)
+//   --output <path>         Output file path (default: js-assets.toml)
+//   --config [path]         Also generate an integrations config (default: trusted-server.toml)
+//   --force                 Overwrite an existing --config target
+//
+// Prerequisites:
+//   cd packages/js-asset-auditor && npm install && npx playwright install chromium
+
+import { existsSync, readFileSync, writeFileSync } from "node:fs";
+import { resolve } from "node:path";
+import { pathToFileURL } from "node:url";
+import { processAssets } from "./process.mjs";
+
+// ---------------------------------------------------------------------------
+// Config reading
+// ---------------------------------------------------------------------------
+
+/**
+ * Read `[publisher].domain` from `trusted-server.toml` in `repoRoot`.
+ *
+ * This is a minimal line scanner, not a TOML parser: it only recognises a
+ * double-quoted `domain = "..."` key inside the `[publisher]` table.
+ *
+ * @param {string} repoRoot - Directory containing `trusted-server.toml`.
+ * @returns {string} The configured publisher domain.
+ * @throws {Error} If the file cannot be read or the key is not found.
+ */
+function readPublisherDomain(repoRoot) {
+  const content = readFileSync(
+    resolve(repoRoot, "trusted-server.toml"),
+    "utf-8",
+  );
+  let inPublisher = false;
+  for (const line of content.split("\n")) {
+    if (/^\[publisher\]/.test(line)) {
+      inPublisher = true;
+      continue;
+    }
+    if (/^\[/.test(line)) {
+      inPublisher = false;
+      continue;
+    }
+    if (inPublisher) {
+      // TOML allows indented keys, so tolerate leading whitespace.
+      const m = line.match(/^\s*domain\s*=\s*"([^"]+)"/);
+      if (m) return m[1];
+    }
+  }
+  throw new Error(
+    "Could not find [publisher].domain in trusted-server.toml",
+  );
+}
+
+// ---------------------------------------------------------------------------
+// CLI argument parsing
+// ---------------------------------------------------------------------------
+
+/**
+ * Parse CLI arguments into an options object.
+ *
+ * Prints a message to stderr and exits with status 1 on an unknown argument,
+ * a flag that is missing its value, an invalid `--settle` value, or a missing
+ * target URL.
+ *
+ * @param {string[]} argv - `process.argv`-style array; parsing starts at index 2.
+ * @returns {object} Options: `url`, `domain`, `diff`, `settle`, `firstParty`,
+ *   `noFilter`, `headless`, `output`, `config`, `force`.
+ */
+function parseArgs(argv) {
+  const args = {
+    url: null,
+    domain: null,
+    diff: false,
+    settle: 6000,
+    firstParty: [],
+    noFilter: false,
+    headless: false,
+    output: "js-assets.toml",
+    config: null,
+    force: false,
+  };
+
+  // Consume the value that follows a flag. Fails loudly rather than letting
+  // `undefined` reach `.split()`, `parseInt`, or `writeFileSync` further on,
+  // where it would surface as an unrelated TypeError or NaN.
+  const takeValue = (flag, index) => {
+    const value = argv[index];
+    if (value === undefined || value.startsWith("--")) {
+      console.error(`Missing value for ${flag}`);
+      process.exit(1);
+    }
+    return value;
+  };
+
+  for (let i = 2; i < argv.length; i++) {
+    const arg = argv[i];
+    if (arg === "--domain") {
+      args.domain = takeValue(arg, ++i);
+    } else if (arg === "--diff") {
+      args.diff = true;
+    } else if (arg === "--settle") {
+      const raw = takeValue(arg, ++i);
+      args.settle = Number.parseInt(raw, 10);
+      if (Number.isNaN(args.settle) || args.settle < 0) {
+        console.error(`Invalid --settle value: ${raw}`);
+        process.exit(1);
+      }
+    } else if (arg === "--first-party") {
+      args.firstParty = takeValue(arg, ++i).split(",").filter(Boolean);
+    } else if (arg === "--no-filter") {
+      args.noFilter = true;
+    } else if (arg === "--headless") {
+      args.headless = true;
+    } else if (arg === "--output") {
+      args.output = takeValue(arg, ++i);
+    } else if (arg === "--config") {
+      // --config with optional path: default to "trusted-server.toml"
+      const next = argv[i + 1];
+      if (next && !next.startsWith("--")) {
+        args.config = argv[++i];
+      } else {
+        args.config = "trusted-server.toml";
+      }
+    } else if (arg === "--force") {
+      args.force = true;
+    } else if (!arg.startsWith("--") && !args.url) {
+      // Only treat the argument as a full URL when it carries an http(s)
+      // scheme; a bare host such as "httpbin.org" still needs one prepended.
+      args.url = /^https?:\/\//i.test(arg) ? arg : `https://${arg}`;
+    } else {
+      console.error(`Unknown argument: ${arg}`);
+      process.exit(1);
+    }
+  }
+
+  if (!args.url) {
+    console.error(
+      "Usage: audit-js-assets <url> [--domain <domain>] [--diff] [--settle <ms>] [--first-party <hosts>] [--no-filter] [--headless] [--output <path>] [--config [path]] [--force]",
+    );
+    process.exit(1);
+  }
+
+  return args;
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+/**
+ * CLI entry point: sweep the target page, write the js-assets TOML output,
+ * optionally generate an integrations config, and print a JSON summary.
+ *
+ * Progress and diagnostics go to stderr; stdout carries only the JSON summary.
+ * Exits with status 1 on argument, dependency, or processing errors, and sets
+ * a non-zero exit code when `--config` targets an existing file without
+ * `--force`.
+ *
+ * @returns {Promise<void>}
+ */
+export async function main() {
+  const args = parseArgs(process.argv);
+  const repoRoot = process.cwd();
+
+  // Resolve publisher domain: --domain flag > trusted-server.toml > infer from URL
+  let domain = args.domain;
+  if (!domain) {
+    try {
+      domain = readPublisherDomain(repoRoot);
+    } catch (err) {
+      // Config missing or unusable — infer from target URL
+      try {
+        const host = new URL(args.url).hostname;
+        domain = host.startsWith("www.") ? host.slice(4) : host;
+      } catch {
+        domain = args.url;
+      }
+      // Say why the config was skipped so a malformed file is not mistaken
+      // for a missing one.
+      const reason =
+        err.code === "ENOENT" ? "No trusted-server.toml found" : err.message;
+      console.error(`${reason}, using domain: ${domain}`);
+    }
+  }
+
+  let chromium;
+  try {
+    ({ chromium } = await import("playwright"));
+  } catch {
+    console.error(
+      "Playwright not installed. Run:\n  cd packages/js-asset-auditor && npm install",
+    );
+    process.exit(1);
+  }
+
+  console.error(`Launching browser...`);
+  let browser;
+  try {
+    browser = await chromium.launch({ headless: args.headless });
+  } catch (err) {
+    if (err.message.includes("Executable doesn't exist")) {
+      console.error(
+        "Chromium not installed. Run:\n  cd packages/js-asset-auditor && npx playwright install chromium",
+      );
+      process.exit(1);
+    }
+    throw err;
+  }
+
+  try {
+    const context = await browser.newContext();
+    const page = await context.newPage();
+
+    const scriptUrls = [];
+    page.on("response", (response) => {
+      const req = response.request();
+      if (req.resourceType() === "script") {
+        scriptUrls.push(req.url());
+      }
+    });
+
+    console.error(`Navigating to ${args.url}...`);
+    await page.goto(args.url, { waitUntil: "load", timeout: 30000 });
+
+    console.error(`Waiting ${args.settle}ms for page to settle...`);
+    await page.waitForTimeout(args.settle);
+
+    const headScriptUrls = await page.evaluate(() =>
+      Array.from(
+        document.head.querySelectorAll("script[src]"),
+      ).map((s) => s.src),
+    );
+
+    console.error(
+      `Found ${scriptUrls.length} network scripts, ${headScriptUrls.length} head scripts`,
+    );
+
+    await browser.close();
+
+    console.error("Processing assets...");
+    const result = processAssets(
+      { networkUrls: scriptUrls, headUrls: headScriptUrls },
+      {
+        domain,
+        target: args.url,
+        output: args.output,
+        diff: args.diff,
+        firstParty: args.firstParty,
+        noFilter: args.noFilter,
+      },
+    );
+
+    if (result.error) {
+      console.error(result.error);
+      process.exit(1);
+    }
+
+    writeFileSync(args.output, result.toml);
+    const count =
+      result.summary.mode === "init"
+        ? result.summary.surfaced
+        : result.summary.new.length;
+    console.error(`Wrote ${args.output} (${count} entries)`);
+
+    // Integration detection & config generation
+    if (args.config) {
+      const { detectIntegrations, generateConfig } = await import(
+        "./detect.mjs"
+      );
+      const detection = detectIntegrations(scriptUrls);
+
+      if (detection.integrations.length === 0) {
+        console.error("No integrations detected — skipping config generation");
+      } else if (existsSync(args.config) && !args.force) {
+        // Refuse to clobber an existing config, and report it as a failure
+        // so callers and CI can tell the file was not written.
+        console.error(
+          `${args.config} already exists. Use --force to overwrite.`,
+        );
+        process.exitCode = 1;
+      } else {
+        const configToml = generateConfig(domain, args.url, detection);
+        writeFileSync(args.config, configToml);
+        console.error(
+          `Wrote ${args.config} (${detection.integrations.length} integrations detected)`,
+        );
+      }
+
+      result.summary.integrations = detection.integrations.map((i) => ({
+        id: i.id,
+        label: i.label,
+        category: i.category,
+        extracted: i.extracted,
+        todos: i.todos,
+      }));
+    }
+
+    console.log(JSON.stringify(result.summary));
+  } finally {
+    if (browser.isConnected()) {
+      await browser.close();
+    }
+  }
+}
+
+// Run when invoked directly. pathToFileURL escapes "#", "?" and "%" and
+// handles Windows drive letters in the script path, which
+// `new URL(path, "file://")` does not.
+const isDirectExecution =
+  Boolean(process.argv[1]) &&
+  pathToFileURL(process.argv[1]).href === import.meta.url;
+
+if (isDirectExecution) {
+  main().catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
+}
diff --git a/packages/js-asset-auditor/lib/detect.mjs b/packages/js-asset-auditor/lib/detect.mjs
new file mode 100644
index 00000000..d2be4980
--- /dev/null
+++ b/packages/js-asset-auditor/lib/detect.mjs
@@ -0,0 +1,307 @@
+// JS Asset Auditor — Integration Detection & Config Generation
+//
+// Detects known integrations from raw script URLs captured during a page sweep,
+// then generates a trusted-server.toml with appropriate [integrations.*] sections.
+//
+// Integration patterns are derived from the Rust source in
+// crates/trusted-server-core/src/integrations/.
+
+// ---------------------------------------------------------------------------
+// Integration pattern registry
+// ---------------------------------------------------------------------------
+
+const PREBID_SUFFIXES = ["/prebid.js", "/prebid.min.js", "/prebidjs.js", "/prebidjs.min.js"];
+
+const INTEGRATION_PATTERNS = [
+  {
+    id: "gpt",
+    label: "Google Publisher Tags",
+    match: (url) =>
+      url.hostname === "securepubads.g.doubleclick.net" &&
+      url.pathname.startsWith("/tag/js/gpt"),
+    extract: (url) => ({
+      script_url: `${url.origin}${url.pathname}`,
+    }),
+    defaults: {
+      cache_ttl_seconds: 3600,
+      rewrite_script: true,
+    },
+    todos: [],
+    category: "full",
+  },
+  {
+    id: "google_tag_manager",
+    label: "Google Tag Manager",
+    match: (url) =>
+      url.hostname === "www.googletagmanager.com" &&
+      url.pathname.includes("/gtm.js"),
+    extract: (url) => {
+      const containerId = url.searchParams.get("id");
+      return containerId ? { container_id: containerId } : {};
+    },
+    defaults: {},
+    todos: (extracted) => (extracted.container_id ? [] : ["container_id"]),
+    category: "partial",
+  },
+  {
+    id: "didomi",
+    label: "Didomi Consent",
+    match: (url) =>
+      url.hostname === "sdk.privacy-center.org" ||
+      url.hostname === "api.privacy-center.org",
+    extract: () => ({}),
+    defaults: {
+      sdk_origin: "https://sdk.privacy-center.org",
+      api_origin: "https://api.privacy-center.org",
+    },
+    todos: [],
+    category: "full",
+  },
+  {
+    id: "datadome",
+    label: "DataDome Bot Protection",
+    match: (url) =>
+      url.hostname === "js.datadome.co" ||
+      url.hostname === "api-js.datadome.co",
+    extract: () => ({}),
+    defaults: {
+      sdk_origin: "https://js.datadome.co",
+      api_origin: "https://api-js.datadome.co",
+      cache_ttl_seconds: 3600,
+      rewrite_sdk: true,
+    },
+    todos: [],
+    category: "full",
+  },
+  {
+    id: "lockr",
+    label: "Lockr Identity",
+    match: (url) => {
+      // Match on the exact host and on the path only: a substring test on the
+      // hostname would accept e.g. "aim.loc.kr.example.com", and testing the
+      // full href would miss SDK URLs that carry a query string.
+      const path = url.pathname.toLowerCase();
+      return (
+        (url.hostname === "aim.loc.kr" ||
+          url.hostname === "identity.loc.kr") &&
+        path.includes("identity-lockr") &&
+        path.endsWith(".js")
+      );
+    },
+    extract: (url) => ({
+      sdk_url: url.href,
+    }),
+    defaults: {
+      api_endpoint: "https://identity.loc.kr",
+      cache_ttl_seconds: 3600,
+      rewrite_sdk: true,
+    },
+    todos: ["app_id"],
+    category: "partial",
+  },
+  {
+    id: "permutive",
+    label: "Permutive DMP",
+    match: (url) =>
+      (url.hostname.endsWith(".edge.permutive.app") ||
+        url.hostname === "cdn.permutive.com") &&
+      url.pathname.endsWith("-web.js"),
+    extract: (url) => {
+      const result = {};
+      // Extract organization_id from subdomain: {org}.edge.permutive.app
+      if (url.hostname.endsWith(".edge.permutive.app")) {
+        result.organization_id = url.hostname.replace(".edge.permutive.app", "");
+      }
+      // Extract workspace_id from filename: /{workspace}-web.js
+      const filename = url.pathname.split("/").pop() || "";
+      const wsMatch = filename.match(/^(.+)-web\.js$/);
+      if (wsMatch) {
+        result.workspace_id = wsMatch[1];
+      }
+      return result;
+    },
+    defaults: {
+      api_endpoint: "https://api.permutive.com",
+      secure_signals_endpoint: "https://secure-signals.permutive.app",
+    },
+    todos: (extracted) => {
+      const missing = [];
+      if (!extracted.organization_id) missing.push("organization_id");
+      if (!extracted.workspace_id) missing.push("workspace_id");
+      return missing;
+    },
+    category: "partial",
+  },
+  {
+    id: "prebid",
+    label: "Prebid Header Bidding",
+    match: (url) => PREBID_SUFFIXES.some((s) => url.pathname.endsWith(s)),
+    extract: () => ({}),
+    defaults: {
+      timeout_ms: 1000,
+      debug: false,
+    },
+    todos: ["server_url", "bidders"],
+    category: "detect_only",
+  },
+  {
+    id: "aps",
+    label: "Amazon Publisher Services",
+    match: (url) =>
+      url.hostname === "c.amazon-adsystem.com" &&
+      url.pathname.includes("/apstag"),
+    extract: () => ({}),
+    defaults: {
+      endpoint: "https://aax.amazon-adsystem.com/e/dtb/bid",
+      timeout_ms: 1000,
+    },
+    todos: ["pub_id"],
+    category: "detect_only",
+  },
+];
+
+// ---------------------------------------------------------------------------
+// Detection
+// ---------------------------------------------------------------------------
+
+/**
+ * Detect known integrations in the script URLs captured during a sweep.
+ *
+ * Each URL is tested against INTEGRATION_PATTERNS in order and counts toward
+ * at most one integration. When several URLs match the same integration,
+ * their source URLs are accumulated and extracted fields are merged with the
+ * first value seen for each field winning. Unparseable URLs are skipped.
+ *
+ * @param {string[]} rawUrls - Raw (un-normalized) script URLs.
+ * @returns {{integrations: object[]}} One entry per detected integration, in
+ *   first-seen order, each with `id`, `label`, `category`, `extracted`,
+ *   `defaults`, `todos`, and `sourceUrls`.
+ */
+export function detectIntegrations(rawUrls) {
+  const detected = new Map();
+
+  for (const rawUrl of rawUrls) {
+    let url;
+    try {
+      url = new URL(rawUrl);
+    } catch {
+      continue;
+    }
+
+    for (const pattern of INTEGRATION_PATTERNS) {
+      if (!pattern.match(url)) continue;
+
+      if (detected.has(pattern.id)) {
+        // Merge: accumulate source URLs, fill in missing extracted fields
+        const existing = detected.get(pattern.id);
+        existing.sourceUrls.push(rawUrl);
+        const newExtracted = pattern.extract(url);
+        for (const [key, value] of Object.entries(newExtracted)) {
+          if (!(key in existing.extracted)) existing.extracted[key] = value;
+        }
+      } else {
+        const extracted = pattern.extract(url);
+        const todos =
+          typeof pattern.todos === "function"
+            ? pattern.todos(extracted)
+            : [...pattern.todos];
+
+        detected.set(pattern.id, {
+          id: pattern.id,
+          label: pattern.label,
+          category: pattern.category,
+          extracted,
+          defaults: { ...pattern.defaults },
+          todos,
+          sourceUrls: [rawUrl],
+        });
+      }
+
+      break; // Each URL matches at most one integration
+    }
+  }
+
+  // Recalculate dynamic todos after merging
+  for (const [id, entry] of detected) {
+    const pattern = INTEGRATION_PATTERNS.find((p) => p.id === id);
+    if (pattern && typeof pattern.todos === "function") {
+      entry.todos = pattern.todos(entry.extracted);
+    }
+  }
+
+  return {
+    integrations: [...detected.values()],
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Config generation
+// ---------------------------------------------------------------------------
+
+/**
+ * Render a JavaScript value as a TOML value literal.
+ *
+ * Strings are emitted as quoted TOML basic strings with `"` and `\` escaped,
+ * so a captured URL containing either character cannot terminate the string
+ * early or introduce an invalid escape sequence. Array elements are
+ * stringified and quoted with the same escaping.
+ *
+ * @param {unknown} value - String, boolean, number, or array.
+ * @returns {string} TOML source text for the value.
+ */
+function formatTomlValue(value) {
+  // The escapes JSON.stringify emits (\" \\ \n \t \uXXXX ...) are all valid
+  // inside a TOML basic string, so it doubles as the escaper here.
+  if (typeof value === "string") return JSON.stringify(value);
+  if (typeof value === "boolean") return value ? "true" : "false";
+  if (typeof value === "number") return String(value);
+  if (Array.isArray(value))
+    return `[${value.map((v) => JSON.stringify(String(v))).join(", ")}]`;
+  return String(value);
+}
+
+export function generateConfig(domain, targetUrl, detectionResult) {
+  const today = new Date().toISOString().slice(0, 10);
+  let toml = "";
+
+  // Header
+  toml += `# Generated by js-asset-auditor on ${today}\n`;
+  toml += `# Source URL: ${targetUrl}\n`;
+  toml += `#\n`;
+  toml += `# Review all values before deploying.
Fields marked TODO need manual input.\n`; + toml += `# Commented-out fields show defaults — uncomment to override.\n`; + toml += `\n`; + + // Publisher section + toml += `[publisher]\n`; + toml += `domain = "${domain}"\n`; + toml += `# cookie_domain = ".${domain}"\n`; + toml += `# origin_url = "https://origin.${domain}"\n`; + toml += `# proxy_secret = "change-me"\n`; + + // Integration sections + for (const integration of detectionResult.integrations) { + toml += `\n`; + toml += `[integrations.${integration.id}]\n`; + toml += `enabled = true\n`; + + // Auto-extracted fields + for (const [key, value] of Object.entries(integration.extracted)) { + toml += `${key} = ${formatTomlValue(value)} # auto-detected\n`; + } + + // TODO fields + for (const field of integration.todos) { + toml += `${field} = "" # TODO: set your ${integration.label} ${field}\n`; + } + + // Default fields (commented out) + for (const [key, value] of Object.entries(integration.defaults)) { + // Skip if already in extracted + if (key in integration.extracted) continue; + toml += `# ${key} = ${formatTomlValue(value)}\n`; + } + } + + return toml; +} diff --git a/packages/js-asset-auditor/lib/process.mjs b/packages/js-asset-auditor/lib/process.mjs new file mode 100644 index 00000000..623926fc --- /dev/null +++ b/packages/js-asset-auditor/lib/process.mjs @@ -0,0 +1,411 @@ +// JS Asset Auditor — Processing Library +// +// Pure processing functions for URL normalization, filtering, wildcard +// detection, slug generation, and TOML formatting. No external dependencies. +// +// The slug algorithm is duplicated from scripts/js-asset-slug.mjs. Both files +// must produce identical output. Any changes must be synchronized. 
// JS Asset Auditor — processing library.
//
// URL normalization, first-party and heuristic filtering, wildcard detection,
// slug generation and TOML formatting. Uses only Node.js built-ins.
//
// The slug algorithm is duplicated in scripts/js-asset-slug.mjs; both copies
// must produce identical output, so change them together.

import { createHash } from "node:crypto";
import { posix } from "node:path";
import { readFileSync } from "node:fs";

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

// Digits first, then uppercase, then lowercase.
const BASE62_CHARSET =
  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

// Hosts whose scripts are dropped from the audit unless filtering is disabled.
// A string entry matches the host itself and any subdomain; an object entry
// additionally requires the pathname to start with `pathPrefix`.
export const HEURISTIC_FILTERS = {
  "Framework CDNs": ["cdnjs.cloudflare.com", "ajax.googleapis.com", "cdn.jsdelivr.net", "unpkg.com"],
  "Error tracking": ["sentry.io", "bugsnag.com", "rollbar.com"],
  "Font services": ["fonts.googleapis.com", "fonts.gstatic.com"],
  "Social embeds": ["platform.twitter.com", "platform.x.com", "connect.facebook.net"],
  "Google ad rendering": [
    "pagead2.googlesyndication.com",
    "tpc.googlesyndication.com",
    "s0.2mdn.net",
    "googleads.g.doubleclick.net",
    "www.googleadservices.com",
  ],
  "Ad fraud detection": ["adtrafficquality.google"],
  "Ad verification": ["adsafeprotected.com", "moatads.com", "doubleverify.com"],
  reCAPTCHA: [
    "recaptcha.net",
    { host: "www.google.com", pathPrefix: "/recaptcha/" },
    { host: "www.gstatic.com", pathPrefix: "/recaptcha/" },
  ],
};

// Path-segment shapes that are treated as volatile and replaced by "*".
const SEMVER_RE = /^\d+\.\d+[\.\d\w-]*$/;
const HEX_HASH_RE = /^[a-f0-9]{8,}$/;
const MIXED_HASH_RE = /^[A-Za-z0-9]{8,}$/;

// ---------------------------------------------------------------------------
// Slug generation
// ---------------------------------------------------------------------------

/**
 * Interpret `buffer` as a big-endian unsigned integer and encode it in base62.
 *
 * @param {Iterable<number>} buffer - bytes, most significant first
 * @returns {string} base62 text; "0" when the integer is zero
 */
function bufferToBase62(buffer) {
  let num = 0n;
  for (const byte of buffer) {
    num = (num << 8n) | BigInt(byte);
  }
  if (num === 0n) return "0";
  const chars = [];
  while (num > 0n) {
    chars.push(BASE62_CHARSET[Number(num % 62n)]);
    num = num / 62n;
  }
  return chars.reverse().join("");
}

/**
 * Return the final path segment of a URL with its last extension removed.
 *
 * If `originUrl` cannot be parsed as a URL it is treated as a path. A single
 * trailing slash is ignored. A leading-dot name (dot at index 0) is kept whole.
 *
 * @param {string} originUrl
 * @returns {string} the stem, or "unknown" when the path has no segments
 */
export function extractAssetStem(originUrl) {
  let pathname;
  try {
    pathname = new URL(originUrl).pathname;
  } catch {
    pathname = originUrl;
  }
  if (pathname.endsWith("/")) pathname = pathname.slice(0, -1);
  const basename = posix.basename(pathname);
  if (!basename || basename === "/") {
    // Fallback: use the last non-empty path segment.
    const segments = pathname.split("/").filter(Boolean);
    const last = segments.at(-1) || "unknown";
    const dot = last.lastIndexOf(".");
    return dot > 0 ? last.slice(0, dot) : last;
  }
  const dot = basename.lastIndexOf(".");
  return dot > 0 ? basename.slice(0, dot) : basename;
}

/**
 * Derive the deterministic slug "{prefix}:{stem}" for an asset, where prefix is
 * the first 8 base62 characters of sha256("{publisherDomain}|{originUrl}") and
 * stem is extractAssetStem(originUrl).
 *
 * @param {string} publisherDomain
 * @param {string} originUrl
 * @returns {string}
 */
export function generateSlug(publisherDomain, originUrl) {
  const input = `${publisherDomain}|${originUrl}`;
  const digest = createHash("sha256").update(input).digest();
  const base62 = bufferToBase62(digest);
  const publisherPrefix = base62.slice(0, 8);
  const assetStem = extractAssetStem(originUrl);
  return `${publisherPrefix}:${assetStem}`;
}

// ---------------------------------------------------------------------------
// URL processing
// ---------------------------------------------------------------------------

/**
 * Canonicalise a script URL for comparison: protocol-relative URLs get
 * "https:", and the fragment, query string and one trailing slash are removed.
 *
 * @param {string} raw
 * @returns {string}
 */
export function normalizeUrl(raw) {
  let url = raw;
  if (url.startsWith("//")) url = "https:" + url;
  const hashIdx = url.indexOf("#");
  if (hashIdx !== -1) url = url.slice(0, hashIdx);
  const qIdx = url.indexOf("?");
  if (qIdx !== -1) url = url.slice(0, qIdx);
  if (url.endsWith("/")) url = url.slice(0, -1);
  return url;
}

/** Drop a leading "www." from a host name. */
function stripWww(host) {
  return host.startsWith("www.") ? host.slice(4) : host;
}

/**
 * Decide whether `hostname` is first-party. Comparison is exact after removing
 * a leading "www." from both sides; other subdomains are NOT treated as
 * first-party unless listed in `extraHosts`.
 *
 * @param {string} hostname - host of the script being classified
 * @param {string} publisherDomain
 * @param {string} targetHost - host of the audited page
 * @param {string[]} [extraHosts] - additional hosts to treat as first-party
 * @returns {boolean}
 */
export function isFirstParty(hostname, publisherDomain, targetHost, extraHosts = []) {
  const stripped = stripWww(hostname);
  if (stripped === stripWww(publisherDomain)) return true;
  if (stripped === stripWww(targetHost)) return true;
  for (const h of extraHosts) {
    if (stripped === stripWww(h)) return true;
  }
  return false;
}

/** True when `hostname` equals `filterEntry` or is a subdomain of it. */
function dotBoundaryMatch(hostname, filterEntry) {
  return hostname === filterEntry || hostname.endsWith("." + filterEntry);
}

/**
 * Look `hostname`/`pathname` up in HEURISTIC_FILTERS.
 *
 * @param {string} hostname
 * @param {string} pathname
 * @returns {{ category: string, entry: string } | null} the first matching
 *   filter (path-scoped entries are reported as "{host}{pathPrefix}*"), or null
 */
export function matchesHeuristicFilter(hostname, pathname) {
  for (const [category, entries] of Object.entries(HEURISTIC_FILTERS)) {
    for (const entry of entries) {
      if (typeof entry === "string") {
        if (dotBoundaryMatch(hostname, entry)) {
          return { category, entry };
        }
      } else {
        if (
          dotBoundaryMatch(hostname, entry.host) &&
          pathname.startsWith(entry.pathPrefix)
        ) {
          return { category, entry: `${entry.host}${entry.pathPrefix}*` };
        }
      }
    }
  }
  return null;
}

// ---------------------------------------------------------------------------
// Wildcard detection
// ---------------------------------------------------------------------------

/**
 * Replace volatile path segments (version-like, lowercase hex of 8+ chars, or
 * 8+ alphanumerics containing both a digit and a letter) with "*".
 *
 * The result is rebuilt from the parsed origin and pathname only.
 *
 * @param {string} url
 * @returns {{ wildcarded: string, original: string | null, hasWildcard: boolean }}
 *   `original` is the input URL when at least one segment was replaced, else
 *   null. An unparseable input is returned unchanged with hasWildcard false.
 */
export function applyWildcards(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return { wildcarded: url, original: null, hasWildcard: false };
  }
  const segments = parsed.pathname.split("/");
  let hasWildcard = false;
  const newSegments = segments.map((seg) => {
    if (!seg) return seg;
    if (SEMVER_RE.test(seg)) {
      hasWildcard = true;
      return "*";
    }
    if (HEX_HASH_RE.test(seg)) {
      hasWildcard = true;
      return "*";
    }
    if (
      MIXED_HASH_RE.test(seg) &&
      /\d/.test(seg) &&
      /[a-zA-Z]/.test(seg)
    ) {
      hasWildcard = true;
      return "*";
    }
    return seg;
  });
  const wildcarded = parsed.origin + newSegments.join("/");
  return { wildcarded, original: hasWildcard ? url : null, hasWildcard };
}

// ---------------------------------------------------------------------------
// TOML formatting
// ---------------------------------------------------------------------------

/**
 * Format the key/value lines of one `[[js_assets]]` entry (the table header is
 * written by the caller).
 *
 * @param {{ slug: string, path: string, originUrl: string, injectInHead: boolean,
 *   hasWildcard?: boolean, originalUrl?: string | null }} asset
 * @param {boolean} [commented=false] - prefix every line with "# "
 * @returns {string}
 */
export function formatTomlEntry(asset, commented = false) {
  const pfx = commented ? "# " : "";
  // Backslash and double quote must be escaped inside a TOML basic string.
  const quote = (value) => `"${String(value).replace(/[\\"]/g, "\\$&")}"`;
  let block = "";
  if (asset.hasWildcard && asset.originalUrl) {
    block += `${pfx}# ${asset.originalUrl} (wildcard detected)\n`;
  }
  block += `${pfx}slug = ${quote(asset.slug)}\n`;
  block += `${pfx}path = ${quote(asset.path)}\n`;
  block += `${pfx}origin_url = ${quote(asset.originUrl)}\n`;
  block += `${pfx}inject_in_head = ${asset.injectInHead}\n`;
  return block;
}

/**
 * Abbreviate a URL to "{hostname}/.../{last path segment}" for display.
 * Unparseable input is returned unchanged.
 *
 * @param {string} url
 * @returns {string}
 */
export function shortenUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }
  const parts = parsed.pathname.split("/").filter(Boolean);
  const filename = parts.at(-1) || parsed.pathname;
  return `${parsed.hostname}/.../` + filename;
}

// ---------------------------------------------------------------------------
// Diff mode: parse existing TOML
// ---------------------------------------------------------------------------

/**
 * Extract `origin_url` / `slug` pairs from existing js-assets TOML text.
 *
 * This is a line-oriented scan, not a TOML parser: the text is split on the
 * literal "[[js_assets]]" and each chunk is searched for lines that START with
 * `origin_url = "..."` / `slug = "..."`. Commented-out entries therefore do not
 * count, and chunks without an origin_url are ignored.
 *
 * @param {string} content
 * @returns {Array<{ originUrl: string, slug: string }>}
 */
export function parseExistingToml(content) {
  const entries = [];
  const blocks = content.split("[[js_assets]]");
  // blocks[0] is whatever precedes the first table header.
  for (let i = 1; i < blocks.length; i++) {
    const block = blocks[i];
    const originMatch = block.match(/^origin_url\s*=\s*"([^"]+)"/m);
    const slugMatch = block.match(/^slug\s*=\s*"([^"]+)"/m);
    if (originMatch) {
      entries.push({
        originUrl: originMatch[1],
        slug: slugMatch ? slugMatch[1] : "",
      });
    }
  }
  return entries;
}

// ---------------------------------------------------------------------------
// Core processing pipeline
// ---------------------------------------------------------------------------

/**
 * Turn collected script URLs into js-assets TOML plus a JSON-friendly summary.
 *
 * Pipeline: normalize and de-duplicate -> drop first-party hosts -> drop
 * heuristic-filtered hosts (unless `args.noFilter`) -> collapse volatile path
 * segments to wildcards -> de-duplicate by wildcarded URL -> build entries.
 *
 * @param {{ networkUrls?: string[], headUrls?: string[] }} input - every script
 *   URL requested by the page, and the subset found in the document head
 * @param {{ domain: string, target: string, output: string, diff?: boolean,
 *   noFilter?: boolean, firstParty?: string[] }} args
 * @returns {{ toml: string, summary: object } | { error: string }}
 *   In diff mode (`args.diff`) the existing file at `args.output` is read and
 *   `toml` is that content with newly seen assets appended as comments;
 *   `{ error }` is returned when the file cannot be read.
 */
export function processAssets(input, args) {
  const { networkUrls: rawNetworkUrls = [], headUrls: rawHeadUrls = [] } = input;

  let targetHost = "";
  try {
    targetHost = new URL(args.target).hostname;
  } catch {
    targetHost = args.target;
  }

  const normalizedNetwork = [...new Set(rawNetworkUrls.map(normalizeUrl))];
  const normalizedHead = new Set(rawHeadUrls.map(normalizeUrl));

  const firstPartyFiltered = [];
  const thirdPartyUrls = [];

  for (const url of normalizedNetwork) {
    let hostname;
    try {
      hostname = new URL(url).hostname;
    } catch {
      continue; // unparseable URLs are dropped entirely
    }
    if (isFirstParty(hostname, args.domain, targetHost, args.firstParty || [])) {
      firstPartyFiltered.push({ url, host: hostname });
    } else {
      thirdPartyUrls.push(url);
    }
  }

  const heuristicFiltered = [];
  const survivingUrls = [];

  for (const url of thirdPartyUrls) {
    let hostname, pathname;
    try {
      const parsed = new URL(url);
      hostname = parsed.hostname;
      pathname = parsed.pathname;
    } catch {
      survivingUrls.push(url);
      continue;
    }

    if (args.noFilter) {
      survivingUrls.push(url);
      continue;
    }

    const match = matchesHeuristicFilter(hostname, pathname);
    if (match) {
      heuristicFiltered.push({ url, host: hostname, ...match });
    } else {
      survivingUrls.push(url);
    }
  }

  const filterCounts = new Map();
  for (const f of heuristicFiltered) {
    filterCounts.set(f.host, (filterCounts.get(f.host) || 0) + 1);
  }

  // Keyed by wildcarded URL; Map iteration keeps first-seen order.
  const assetsByOrigin = new Map();

  for (const url of survivingUrls) {
    const { wildcarded, original, hasWildcard } = applyWildcards(url);
    const inHead = normalizedHead.has(url);

    const existing = assetsByOrigin.get(wildcarded);
    if (existing) {
      // Several concrete URLs can collapse onto one wildcard entry. The entry
      // is head-injected if ANY of them was loaded from <head>, not only the
      // first one encountered.
      if (inHead) existing.injectInHead = true;
      continue;
    }

    const slug = generateSlug(args.domain, wildcarded);
    const prefix = slug.split(":")[0];

    let path;
    if (hasWildcard) {
      path = `/js-assets/${prefix}/*`;
    } else {
      const stem = extractAssetStem(wildcarded);
      path = `/js-assets/${prefix}/${stem}.js`;
    }

    let hostname;
    try {
      hostname = new URL(url).hostname;
    } catch {
      hostname = "unknown";
    }

    assetsByOrigin.set(wildcarded, {
      slug,
      prefix,
      path,
      originUrl: wildcarded,
      originalUrl: original,
      injectInHead: inHead,
      hasWildcard,
      host: hostname,
      shortUrl: shortenUrl(wildcarded),
    });
  }

  const assets = [...assetsByOrigin.values()];
  const today = new Date().toISOString().slice(0, 10);

  if (args.diff) {
    let existingContent;
    try {
      existingContent = readFileSync(args.output, "utf-8");
    } catch {
      return { error: `Cannot read ${args.output} for diff mode` };
    }

    const existingEntries = parseExistingToml(existingContent);
    const existingOrigins = new Set(existingEntries.map((e) => e.originUrl));
    const sweepOrigins = new Set(assets.map((a) => a.originUrl));

    const confirmed = existingEntries.filter((e) => sweepOrigins.has(e.originUrl));
    const missing = existingEntries.filter((e) => !sweepOrigins.has(e.originUrl));
    const newAssets = assets.filter((a) => !existingOrigins.has(a.originUrl));

    // New assets are appended commented-out so nothing activates without review.
    let appendBlock = "";
    if (newAssets.length > 0) {
      appendBlock = `\n# --- NEW (detected by /audit-js-assets --diff on ${today}, uncomment to activate) ---\n`;
      for (const a of newAssets) {
        appendBlock += `\n# [[js_assets]]\n`;
        appendBlock += formatTomlEntry(a, true);
      }
    }

    return {
      toml: existingContent + appendBlock,
      summary: {
        mode: "diff",
        publisherDomain: args.domain,
        targetUrl: args.target,
        confirmed: confirmed.map((e) => ({ slug: e.slug, originUrl: e.originUrl })),
        new: newAssets.map((a) => ({ slug: a.slug, prefix: a.prefix, shortUrl: a.shortUrl, originUrl: a.originUrl })),
        missing: missing.map((e) => ({ slug: e.slug, originUrl: e.originUrl })),
        outputFile: args.output,
      },
    };
  }

  let toml = `# Generated by /audit-js-assets on ${today}\n`;
  toml += `# Publisher: ${args.domain}\n`;
  toml += `# Source URL: ${args.target}\n`;

  for (const a of assets) {
    toml += `\n[[js_assets]]\n`;
    toml += formatTomlEntry(a);
  }

  const filterSummary = [...filterCounts].map(([host, count]) => ({ host, count }));

  return {
    toml,
    summary: {
      mode: "init",
      publisherDomain: args.domain,
      targetUrl: args.target,
      totalDetected: thirdPartyUrls.length,
      firstPartyFiltered: firstPartyFiltered.length,
      firstPartyHost: targetHost,
      heuristicFiltered: filterSummary,
      heuristicFilteredTotal: heuristicFiltered.length,
      surfaced: assets.length,
      assets: assets.map((a) => ({
        prefix: a.prefix,
        injectInHead: a.injectInHead,
        shortUrl: a.shortUrl,
        wildcard: a.hasWildcard,
      })),
      outputFile: args.output,
    },
  };
}
//
// Algorithm:
//   publisher_prefix = first_8_chars(base62(sha256(domain + "|" + url)))
//   asset_stem       = filename_without_extension(url)
//   slug             = "{publisher_prefix}:{asset_stem}"
//
// Usage:
//   node packages/js-asset-auditor/lib/slug.mjs <publisher-domain> <origin-url>

import { generateSlug } from "./process.mjs";

// Both positional arguments are required; anything beyond them is ignored.
const [publisherDomain, originUrl] = process.argv.slice(2);

if (!publisherDomain || !originUrl) {
  // Name the expected arguments so the message is actionable.
  console.error(
    "Usage: node packages/js-asset-auditor/lib/slug.mjs <publisher-domain> <origin-url>",
  );
  process.exit(1);
}

console.log(generateSlug(publisherDomain, originUrl));
"1.59.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz", + "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + } + } +} diff --git a/packages/js-asset-auditor/package.json b/packages/js-asset-auditor/package.json new file mode 100644 index 00000000..044f4214 --- /dev/null +++ b/packages/js-asset-auditor/package.json @@ -0,0 +1,12 @@ +{ + "name": "js-asset-auditor", + "version": "1.0.0", + "private": true, + "type": "module", + "bin": { + "audit-js-assets": "./bin/audit-js-assets" + }, + "dependencies": { + "playwright": "^1.58.0" + } +} diff --git a/packages/js-asset-auditor/settings.json b/packages/js-asset-auditor/settings.json new file mode 100644 index 00000000..c17ba006 --- /dev/null +++ b/packages/js-asset-auditor/settings.json @@ -0,0 +1,7 @@ +{ + "permissions": { + "allow": [ + "Bash(audit-js-assets:*)" + ] + } +} diff --git a/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md b/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md new file mode 100644 index 00000000..8590f852 --- /dev/null +++ b/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md @@ -0,0 +1,79 @@ +--- +name: audit-js-assets +description: Audit a publisher page for third-party JavaScript assets. Use when analyzing external scripts, generating js-assets.toml entries, or monitoring changes to a publisher's JS footprint. +--- + +Audit a publisher page for third-party JS assets and generate `js-assets.toml` entries. + +Usage: /js-asset-auditor:audit-js-assets $ARGUMENTS + +`$ARGUMENTS`: ` [--diff] [--settle ] [--first-party ,...] [--no-filter] [--headless] [--config [path]] [--force]` + +--- + +Follow these steps exactly. Stop and report if any step fails. + +## 1. 
Run the auditor + +Run the Playwright CLI via Bash, forwarding all arguments from `$ARGUMENTS`: + +```bash +audit-js-assets $ARGUMENTS +``` + +The CLI reads `trusted-server.toml` for the publisher domain, opens a headless browser, collects script URLs, processes them, and writes `js-assets.toml`. Progress lines appear on stderr; a JSON summary prints to stdout. + +If the command fails with "Playwright not installed" or "Chromium not installed", tell the user to run: + +```bash +cd packages/js-asset-auditor && npm install && npx playwright install chromium +``` + +## 2. Show results + +Parse the JSON summary from stdout and print a formatted report. + +### Init mode + +``` +JS Asset Audit — {publisherDomain} +──────────────────────────────── +Detected: {totalDetected} third-party JS requests +Filtered: {heuristicFilteredTotal} ({host} x{count}, ...) +Surfaced: {surfaced} assets → js-assets.toml + + {prefix} inject_in_head={true|false} {shortUrl} + {prefix} inject_in_head={true|false} {shortUrl} [wildcard] + ... + +Review inject_in_head values and commit js-assets.toml when ready. +Diff mode: /js-asset-auditor:audit-js-assets --diff +``` + +### Diff mode + +``` +JS Asset Audit (diff) — {publisherDomain} +──────────────────────────────── +Confirmed: {confirmed.length} assets still present on page +New: {new.length} asset(s) detected (appended as comment to js-assets.toml) +Missing: {missing.length} asset(s) no longer seen on page ⚠ + + NEW {prefix} {shortUrl} → review in js-assets.toml + MISSING {slug} {originUrl} → may have been removed or renamed +``` + +### Integration detection (when --config is used) + +If the JSON summary includes an `integrations` array, append: + +``` +Detected Integrations: + {id} ✓ fully configured + {id} ✓ {field}={value} (auto-detected) + {id} ⚠ {field} needs manual input + +→ {config path} generated with {count} integrations +``` + +Use ✓ for `full` category and integrations with no TODOs. Use ⚠ for integrations with TODO fields. 
#!/usr/bin/env node

// JS Asset Slug Generator
//
// Shared utility for generating deterministic slugs for js-assets.toml entries.
// Used by the /audit-js-assets command and must produce identical output to the
// Rust proxy's KV key derivation.
//
// Algorithm:
//   publisher_prefix = first_8_chars(base62(sha256(domain + "|" + url)))
//   asset_stem       = filename_without_extension(url)
//   slug             = "{publisher_prefix}:{asset_stem}"
//
// base62 charset: 0-9A-Za-z (digits first, then uppercase, then lowercase)
//
// Usage:
//   node scripts/js-asset-slug.mjs <publisher-domain> <origin-url>
//   node scripts/js-asset-slug.mjs test-publisher.com https://vendor.io/sdk/loader.js
//   # Output: <8-char-prefix>:loader

import { createHash } from "node:crypto";
import { posix } from "node:path";

const BASE62_CHARSET =
  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

/**
 * Treat the buffer as a big-endian unsigned integer and convert it to base62.
 *
 * @param {Iterable<number>} buffer - bytes, most significant first
 * @returns {string} base62 text; "0" when the integer is zero
 */
function bufferToBase62(buffer) {
  let num = 0n;
  for (const byte of buffer) {
    num = (num << 8n) | BigInt(byte);
  }

  if (num === 0n) return "0";

  const chars = [];
  while (num > 0n) {
    chars.push(BASE62_CHARSET[Number(num % 62n)]);
    num = num / 62n;
  }

  return chars.reverse().join("");
}

/**
 * Return the final path segment of a URL with its last extension removed.
 * Input that is not a parseable URL is treated as a path.
 *
 * @param {string} originUrl
 * @returns {string} the stem, or "unknown" when the path has no segments
 */
function extractAssetStem(originUrl) {
  let pathname;
  try {
    pathname = new URL(originUrl).pathname;
  } catch {
    pathname = originUrl;
  }

  // Remove trailing slash
  if (pathname.endsWith("/")) {
    pathname = pathname.slice(0, -1);
  }

  const basename = posix.basename(pathname);
  if (!basename || basename === "/") {
    // Fallback: use last non-empty path segment
    const segments = pathname.split("/").filter(Boolean);
    const last = segments.at(-1) || "unknown";
    const dot = last.lastIndexOf(".");
    return dot > 0 ? last.slice(0, dot) : last;
  }

  // dot > 0 keeps leading-dot names (e.g. ".hidden") intact.
  const dot = basename.lastIndexOf(".");
  return dot > 0 ? basename.slice(0, dot) : basename;
}

/**
 * Build the slug "{prefix}:{stem}" from the publisher domain and origin URL.
 *
 * @param {string} publisherDomain
 * @param {string} originUrl
 * @returns {string}
 */
function generateSlug(publisherDomain, originUrl) {
  const input = `${publisherDomain}|${originUrl}`;
  const digest = createHash("sha256").update(input).digest();
  const base62 = bufferToBase62(digest);
  const publisherPrefix = base62.slice(0, 8);
  const assetStem = extractAssetStem(originUrl);
  return `${publisherPrefix}:${assetStem}`;
}

// Both positional arguments are required; anything beyond them is ignored.
const [publisherDomain, originUrl] = process.argv.slice(2);

if (!publisherDomain || !originUrl) {
  // Name the expected arguments so the message is actionable.
  console.error(
    "Usage: node scripts/js-asset-slug.mjs <publisher-domain> <origin-url>",
  );
  process.exit(1);
}

console.log(generateSlug(publisherDomain, originUrl));