From 89fab0bd4fb619662379eb7734f54b83717d94d6 Mon Sep 17 00:00:00 2001 From: jevansnyc Date: Wed, 1 Apr 2026 13:17:15 -0500 Subject: [PATCH 01/12] Add JS Asset Auditor engineering spec Engineering spec for the /audit-js-assets . Covers sweep protocol, Chrome DevTools MCP tooling, heuristic filtering, slug generation, init and diff modes. Closes #606 --- .../2026-04-01-js-asset-auditor-design.md | 216 ++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md diff --git a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md new file mode 100644 index 00000000..d6168592 --- /dev/null +++ b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md @@ -0,0 +1,216 @@ +# JS Asset Auditor — Engineering Spec + +**Date:** 2026-04-01 +**Status:** Approved for engineering breakdown +**Related:** [JS Asset Proxy spec](2026-04-01-js-asset-proxy-design.md) + +--- + +## Context + +The JS Asset Proxy requires a `js-assets.toml` file declaring which third-party JS assets to proxy. Without tooling, populating this file requires manually inspecting network requests in browser DevTools, extracting URLs, generating opaque slugs, and writing TOML — a tedious error-prone process that is a barrier to publisher onboarding. + +The Auditor eliminates this friction. It sweeps a publisher's page using the Chrome DevTools MCP, detects third-party JS assets, auto-generates `js-assets.toml` entries, and auto-detects `inject_in_head` from the page DOM. The operator's only remaining decision is reviewing the output before committing. + +It also runs as a monitoring tool — `--diff` mode compares a new sweep against the existing config and surfaces new or removed assets, giving publishers ongoing visibility into their third-party JS footprint. + +**Implementation:** Pure Claude Code skill — no Rust, no compiled code, no additional dependencies. 
Uses the Chrome DevTools MCP already configured in `.claude/settings.json`. + +--- + +## Command Interface + +```bash +/audit-js-assets https://www.publisher.com # init — generate js-assets.toml +/audit-js-assets https://www.publisher.com --diff # diff — compare against existing file +``` + +--- + +## Sweep Protocol + +1. Read `trusted-server.toml` → extract `publisher.domain` (defines first-party boundary) +2. Open Chrome via `mcp__chrome-devtools__new_page`, navigate to target URL via `mcp__chrome-devtools__navigate_page` +3. Wait for full page load + ~6s settle window for async script loads (`mcp__chrome-devtools__wait_for`) +4. In parallel: + - `mcp__chrome-devtools__list_network_requests` → filter for requests where URL ends in `.js` or `Content-Type: application/javascript`, and origin ≠ `publisher.domain` + - `mcp__chrome-devtools__evaluate_script` → `Array.from(document.head.querySelectorAll('script[src]')).map(s => s.src)` → collect head-loaded script URLs +5. Apply heuristic filter (see below) +6. For each surviving asset, generate a `[[js_assets]]` entry (see below) +7. Write output (init or diff mode) +8. Print terminal summary +9. Close page via `mcp__chrome-devtools__close_page` + +--- + +## Heuristic Filter + +The following origin categories are excluded silently. The terminal summary reports what was filtered and why so operators can manually add entries if needed. + +| Category | Excluded origins | +|---|---| +| Framework CDNs | `cdnjs.cloudflare.com`, `ajax.googleapis.com`, `cdn.jsdelivr.net`, `unpkg.com` | +| Error tracking | `sentry.io`, `bugsnag.com`, `rollbar.com` | +| Font services | `fonts.googleapis.com`, `fonts.gstatic.com` | +| Social embeds | `platform.twitter.com`, `connect.facebook.net` | + +**`googletagmanager.com` is not filtered** — GTM is ad tech and should be proxied. + +Everything else surfaces for operator review. 
+ +--- + +## Asset Entry Generation + +| Field | Derivation | +|---|---| +| `slug` | `{publisher_prefix}:{asset_stem}` — see slug algorithm below | +| `path` | `/{publisher_prefix}/{asset_stem}.js`, or wildcard variant if versioned path detected | +| `origin_url` | Full captured URL, with wildcard substitution applied if versioned | +| `ttl_sec` | Omitted — proxy defaults to 1800 (wildcard) or 3600 (fixed) | +| `inject_in_head` | `true` if URL appeared in head script list from DOM evaluation, else `false` | + +### Slug algorithm + +``` +publisher_prefix = first_8_chars(base62(sha256(publisher.domain + origin_url))) +asset_stem = filename_without_extension(origin_url) +slug = "{publisher_prefix}:{asset_stem}" +``` + +**Rationale:** Fully opaque and hash-derived — no human naming required, no ambiguity for cryptic vendor filenames. The KV metadata (`origin_url`, `content_type`, `asset_slug`) serves as the lookup table. Operators can query `js-asset:{slug}` in the KV store to retrieve full provenance. The terminal summary also prints slug → origin_url at generation time. + +**Important:** This algorithm must produce identical output to the Proxy's KV key derivation. Engineering should implement this as a shared utility (e.g., a small JS/TS helper in the skill, or a standalone `scripts/` utility) rather than duplicating the logic. + +### Wildcard detection + +Path segments matching either pattern are replaced with `*`: +- Semver: `\d+\.\d+[\.\d-]*` (e.g., `1.19.8-hcskhn`) +- Hash-like: `[a-f0-9]{6,}` or `[A-Za-z0-9]{8,}` between path separators + +The original URL is preserved as a comment above the generated entry so operators can verify the wildcard substitution is correct. 
+ +--- + +## Init Mode Output + +### `js-assets.toml` (written to repo root) + +```toml +# Generated by /audit-js-assets on 2026-04-01 +# Publisher: publisher.com +# Source URL: https://www.publisher.com + +[[js_assets]] +# https://web.prebidwrapper.com/golf-WnLmpLyEjL/default-v2/prebid-load.js +slug = "aB3kR7mN:prebid-load" +path = "/sdk/aB3kR7mN.js" +origin_url = "https://web.prebidwrapper.com/golf-WnLmpLyEjL/default-v2/prebid-load.js" +inject_in_head = true + +[[js_assets]] +# https://raven-static.vendor.io/prod/1.19.8-hcskhn/raven.js (wildcard detected) +slug = "xQ9pL2wY:raven" +path = "/raven-static/*" +origin_url = "https://raven-static.vendor.io/prod/*/raven.js" +inject_in_head = false +``` + +### Terminal summary + +``` +JS Asset Audit — publisher.com +──────────────────────────────── +Detected: 8 third-party JS requests +Filtered: 3 (cdnjs.cloudflare.com ×2, sentry.io ×1) +Surfaced: 5 assets → js-assets.toml + + aB3kR7mN inject_in_head=true web.prebidwrapper.com/.../prebid-load.js + xQ9pL2wY inject_in_head=false raven-static.vendor.io/prod/*/raven.js [wildcard] + zM4nK8vP inject_in_head=true googletagmanager.com/gtm.js + ... + +Review inject_in_head values and commit js-assets.toml when ready. +Diff mode: /audit-js-assets --diff +``` + +--- + +## Diff Mode Output + +Compares sweep results against the existing `js-assets.toml`. + +| Condition | Behavior | +|---|---| +| Asset in sweep, not in file | **New** — appended to `js-assets.toml` as a commented-out block | +| Asset in file, not in sweep | **Missing** — flagged in terminal summary with `⚠`. Never auto-removed. | +| Asset in both | **Confirmed** — listed as present | + +New entries are appended as TOML comments so the file stays valid and nothing is activated without the operator explicitly uncommenting. 
+ +### `js-assets.toml` (new entry appended as comment) + +```toml +# --- NEW (detected by /audit-js-assets --diff on 2026-04-01, uncomment to activate) --- +# [[js_assets]] +# # https://googletagmanager.com/gtm.js +# slug = "zM4nK8vP:gtm" +# path = "/sdk/zM4nK8vP.js" +# origin_url = "https://googletagmanager.com/gtm.js" +# inject_in_head = true +``` + +### Terminal summary (diff mode) + +``` +JS Asset Audit (diff) — publisher.com +──────────────────────────────── +Confirmed: 4 assets still present on page +New: 1 asset detected (appended as comment to js-assets.toml) +Missing: 1 asset no longer seen on page ⚠ + + NEW zM4nK8vP googletagmanager.com/gtm.js → review in js-assets.toml + MISSING xQ9pL2wY raven-static.vendor.io/... → may have been removed or renamed +``` + +--- + +## Implementation + +The Auditor is a Claude Code skill file. No compiled code. + +**Skill location:** `.claude/skills/audit-js-assets.md` + +**MCP tools used:** +- `mcp__chrome-devtools__new_page` — open browser tab +- `mcp__chrome-devtools__navigate_page` — load publisher URL +- `mcp__chrome-devtools__wait_for` — settle after page load +- `mcp__chrome-devtools__list_network_requests` — capture JS requests +- `mcp__chrome-devtools__evaluate_script` — detect head-loaded scripts via DOM query +- `mcp__chrome-devtools__close_page` — clean up tab + +**File tools used:** +- `Read` — read `trusted-server.toml` (publisher domain) and existing `js-assets.toml` (diff mode) +- `Write` — write generated/updated `js-assets.toml` + +--- + +## Delivery Order + +The Auditor should be delivered **after Proxy Phase 1** (so `js-assets.toml` schema is defined) and **before Proxy Phase 2** (so engineering has real populated entries to test the cache pipeline against actual vendor origins). + +See [delivery order in the Proxy spec](2026-04-01-js-asset-proxy-design.md). 
+ +--- + +## Verification + +- Run `/audit-js-assets https://www.publisher.com` against a known test publisher page with identified third-party JS +- Verify generated entries match actual third-party JS observed on the page (cross-check in browser DevTools) +- Verify `inject_in_head = true` only for scripts that appear in `<head>` (not `<body>`) +- Verify wildcard detection fires for versioned path segments and not for stable paths +- Verify GTM (`googletagmanager.com`) is captured and not filtered +- Verify framework CDNs (`cdnjs.cloudflare.com` etc.) are filtered with reason in summary +- Run `--diff` against an unchanged page → all entries confirmed, no new/missing +- Run `--diff` after adding a new vendor script to the page → appears as `NEW` in summary +- Run `--diff` after removing a script → appears as `MISSING ⚠` in summary, file unchanged From d8a0d84c914261ecd3d6ffd1d4c95369b4a2de86 Mon Sep 17 00:00:00 2001 From: Christian Date: Fri, 10 Apr 2026 15:46:42 -0500 Subject: [PATCH 02/12] Address PR feedback on JS Asset Auditor spec Fix incorrect MCP tool name prefix, replace misused wait_for with evaluate_script setTimeout, correct list_network_requests filtering to use resourceTypes, resolve path derivation contradiction with consistent /js-assets/{prefix}/{stem}.js formula, pin slug separator and base62 charset, add URL Processing section with normalization rules and first-party boundary definition, tighten wildcard regex to require mixed character classes, and move skill location to .claude/commands/. 
--- .../2026-04-01-js-asset-auditor-design.md | 113 ++++++++++++------ 1 file changed, 74 insertions(+), 39 deletions(-) diff --git a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md index d6168592..aae3db57 100644 --- a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md +++ b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md @@ -2,7 +2,7 @@ **Date:** 2026-04-01 **Status:** Approved for engineering breakdown -**Related:** [JS Asset Proxy spec](2026-04-01-js-asset-proxy-design.md) +**Related:** [JS Asset Proxy spec](2026-04-01-js-asset-proxy-design.md) _(on `js-asset-proxy-spec` branch until merged)_ --- @@ -21,8 +21,9 @@ It also runs as a monitoring tool — `--diff` mode compares a new sweep against ## Command Interface ```bash -/audit-js-assets https://www.publisher.com # init — generate js-assets.toml -/audit-js-assets https://www.publisher.com --diff # diff — compare against existing file +/audit-js-assets https://www.publisher.com # init — generate js-assets.toml +/audit-js-assets https://www.publisher.com --diff # diff — compare against existing file +/audit-js-assets https://www.publisher.com --settle 15000 # longer settle for ad-tech-heavy pages ``` --- @@ -30,16 +31,38 @@ It also runs as a monitoring tool — `--diff` mode compares a new sweep against ## Sweep Protocol 1. Read `trusted-server.toml` → extract `publisher.domain` (defines first-party boundary) -2. Open Chrome via `mcp__chrome-devtools__new_page`, navigate to target URL via `mcp__chrome-devtools__navigate_page` -3. Wait for full page load + ~6s settle window for async script loads (`mcp__chrome-devtools__wait_for`) +2. Open Chrome via `mcp__plugin_chrome-devtools-mcp_chrome-devtools__new_page`, navigate to target URL via `mcp__plugin_chrome-devtools-mcp_chrome-devtools__navigate_page` +3. 
Wait for page load settle: `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` with `await new Promise(r => setTimeout(r, SETTLE_MS))` where `SETTLE_MS` defaults to 6000 (configurable via `--settle <ms>`) 4. In parallel: - - `mcp__chrome-devtools__list_network_requests` → filter for requests where URL ends in `.js` or `Content-Type: application/javascript`, and origin ≠ `publisher.domain` - - `mcp__chrome-devtools__evaluate_script` → `Array.from(document.head.querySelectorAll('script[src]')).map(s => s.src)` → collect head-loaded script URLs -5. Apply heuristic filter (see below) + - `mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests` with `resourceTypes: ["script"]` → post-filter to exclude first-party hosts (see URL Processing below) + - `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` → `Array.from(document.head.querySelectorAll('script[src]')).map(s => s.src)` → collect head-loaded script URLs +5. Apply URL normalization (see below), then heuristic filter (see below) 6. For each surviving asset, generate a `[[js_assets]]` entry (see below) 7. Write output (init or diff mode) 8. Print terminal summary -9. Close page via `mcp__chrome-devtools__close_page` +9. Close page via `mcp__plugin_chrome-devtools-mcp_chrome-devtools__close_page` + +**`inject_in_head` semantics:** The DOM snapshot in step 4 captures the final state of `<head>` after the settle window. Scripts that were briefly inserted and then removed by a loader will not appear. This is intentional — `inject_in_head = true` means "the script is present in `<head>` at page-stable state." If a loader removes it before the snapshot, the proxy should not re-inject it. + +--- + +## URL Processing + +### First-party boundary + +A network request is **first-party** if the request URL's host, after stripping a leading `www.`, matches `publisher.domain` (from `trusted-server.toml`) after the same stripping. Matching is exact on the resulting strings. 
+ +Publisher-owned CDN subdomains (e.g., `cdn.publisher.com`, `static.publisher.com`) are treated as third-party by default. If the publisher wants to exclude them, they can be added to a `first_party_hosts` list in the command invocation (e.g., `--first-party cdn.publisher.com`). + +### URL normalization + +Applied to every captured script URL before slug generation and before persisting `origin_url`: + +1. Strip fragment (`#...`) +2. Strip all query parameters — cache-busters (`?v=123`, `?cb=timestamp`), consent params, and session tokens all live in query strings. JS asset versioning uses path segments, not query params. +3. Strip trailing slash from the path + +The normalized URL is what gets stored in `origin_url` and fed into the slug hash. --- @@ -47,12 +70,14 @@ It also runs as a monitoring tool — `--diff` mode compares a new sweep against The following origin categories are excluded silently. The terminal summary reports what was filtered and why so operators can manually add entries if needed. -| Category | Excluded origins | -|---|---| +**Matching:** A filter entry matches if the request URL's host equals the entry or ends with `.` followed by the entry (dot-boundary suffix match). For example, `sentry.io` in the filter matches `sentry.io` and `o123.ingest.sentry.io` but not `notsentry.io`. 
+ +| Category | Excluded origins | +| -------------- | ------------------------------------------------------------------------------ | | Framework CDNs | `cdnjs.cloudflare.com`, `ajax.googleapis.com`, `cdn.jsdelivr.net`, `unpkg.com` | -| Error tracking | `sentry.io`, `bugsnag.com`, `rollbar.com` | -| Font services | `fonts.googleapis.com`, `fonts.gstatic.com` | -| Social embeds | `platform.twitter.com`, `connect.facebook.net` | +| Error tracking | `sentry.io`, `bugsnag.com`, `rollbar.com` | +| Font services | `fonts.googleapis.com`, `fonts.gstatic.com` | +| Social embeds | `platform.twitter.com`, `platform.x.com`, `connect.facebook.net` | **`googletagmanager.com` is not filtered** — GTM is ad tech and should be proxied. @@ -62,31 +87,38 @@ Everything else surfaces for operator review. ## Asset Entry Generation -| Field | Derivation | -|---|---| -| `slug` | `{publisher_prefix}:{asset_stem}` — see slug algorithm below | -| `path` | `/{publisher_prefix}/{asset_stem}.js`, or wildcard variant if versioned path detected | -| `origin_url` | Full captured URL, with wildcard substitution applied if versioned | -| `ttl_sec` | Omitted — proxy defaults to 1800 (wildcard) or 3600 (fixed) | -| `inject_in_head` | `true` if URL appeared in head script list from DOM evaluation, else `false` | +| Field | Derivation | +| ---------------- | --------------------------------------------------------------------------------------------------- | +| `slug` | `{publisher_prefix}:{asset_stem}` — see slug algorithm below | +| `path` | Fixed: `/js-assets/{publisher_prefix}/{asset_stem}.js`. 
Wildcard: `/js-assets/{publisher_prefix}/*` | +| `origin_url` | Normalized URL (see URL Processing), with wildcard substitution applied if versioned | +| `ttl_sec` | Omitted — proxy defaults to 1800 (wildcard) or 3600 (fixed) | +| `stale_ttl_sec` | Omitted — proxy defaults to 86400 (24h) | +| `inject_in_head` | `true` if URL appeared in head script list from DOM evaluation, else `false` | ### Slug algorithm ``` -publisher_prefix = first_8_chars(base62(sha256(publisher.domain + origin_url))) +publisher_prefix = first_8_chars(base62(sha256(publisher.domain + "|" + origin_url))) asset_stem = filename_without_extension(origin_url) slug = "{publisher_prefix}:{asset_stem}" ``` +The pipe (`|`) separator is required — it cannot appear in domain names or at the start of a URL, so the hash input is unambiguous. The `origin_url` fed into the hash must be the normalized URL (see URL Processing). + +**base62 charset:** `0-9A-Za-z` (digits first, then uppercase, then lowercase). This matches the `base62` crate convention. + **Rationale:** Fully opaque and hash-derived — no human naming required, no ambiguity for cryptic vendor filenames. The KV metadata (`origin_url`, `content_type`, `asset_slug`) serves as the lookup table. Operators can query `js-asset:{slug}` in the KV store to retrieve full provenance. The terminal summary also prints slug → origin_url at generation time. **Important:** This algorithm must produce identical output to the Proxy's KV key derivation. Engineering should implement this as a shared utility (e.g., a small JS/TS helper in the skill, or a standalone `scripts/` utility) rather than duplicating the logic. 
### Wildcard detection -Path segments matching either pattern are replaced with `*`: +Path segments matching any of these patterns are replaced with `*`: + - Semver: `\d+\.\d+[\.\d-]*` (e.g., `1.19.8-hcskhn`) -- Hash-like: `[a-f0-9]{6,}` or `[A-Za-z0-9]{8,}` between path separators +- Hex hash: `[a-f0-9]{8,}` between path separators (lowercase hex, minimum 8 characters) +- Mixed alphanumeric hash: `[A-Za-z0-9]{8,}` between path separators, **must contain at least one digit and at least one letter** — this excludes pure-alpha dictionary words like `analytics` or `bootstrap` The original URL is preserved as a comment above the generated entry so operators can verify the wildcard substitution is correct. @@ -104,14 +136,14 @@ The original URL is preserved as a comment above the generated entry so operator [[js_assets]] # https://web.prebidwrapper.com/golf-WnLmpLyEjL/default-v2/prebid-load.js slug = "aB3kR7mN:prebid-load" -path = "/sdk/aB3kR7mN.js" +path = "/js-assets/aB3kR7mN/prebid-load.js" origin_url = "https://web.prebidwrapper.com/golf-WnLmpLyEjL/default-v2/prebid-load.js" inject_in_head = true [[js_assets]] # https://raven-static.vendor.io/prod/1.19.8-hcskhn/raven.js (wildcard detected) slug = "xQ9pL2wY:raven" -path = "/raven-static/*" +path = "/js-assets/xQ9pL2wY/*" origin_url = "https://raven-static.vendor.io/prod/*/raven.js" inject_in_head = false ``` @@ -140,11 +172,11 @@ Diff mode: /audit-js-assets --diff Compares sweep results against the existing `js-assets.toml`. -| Condition | Behavior | -|---|---| -| Asset in sweep, not in file | **New** — appended to `js-assets.toml` as a commented-out block | +| Condition | Behavior | +| --------------------------- | ----------------------------------------------------------------------- | +| Asset in sweep, not in file | **New** — appended to `js-assets.toml` as a commented-out block | | Asset in file, not in sweep | **Missing** — flagged in terminal summary with `⚠`. Never auto-removed. 
| -| Asset in both | **Confirmed** — listed as present | +| Asset in both | **Confirmed** — listed as present | New entries are appended as TOML comments so the file stays valid and nothing is activated without the operator explicitly uncommenting. @@ -155,7 +187,7 @@ New entries are appended as TOML comments so the file stays valid and nothing is # [[js_assets]] # # https://googletagmanager.com/gtm.js # slug = "zM4nK8vP:gtm" -# path = "/sdk/zM4nK8vP.js" +# path = "/js-assets/zM4nK8vP/gtm.js" # origin_url = "https://googletagmanager.com/gtm.js" # inject_in_head = true ``` @@ -179,17 +211,20 @@ Missing: 1 asset no longer seen on page ⚠ The Auditor is a Claude Code skill file. No compiled code. -**Skill location:** `.claude/skills/audit-js-assets.md` +**Skill location:** `.claude/commands/audit-js-assets.md` **MCP tools used:** -- `mcp__chrome-devtools__new_page` — open browser tab -- `mcp__chrome-devtools__navigate_page` — load publisher URL -- `mcp__chrome-devtools__wait_for` — settle after page load -- `mcp__chrome-devtools__list_network_requests` — capture JS requests -- `mcp__chrome-devtools__evaluate_script` — detect head-loaded scripts via DOM query -- `mcp__chrome-devtools__close_page` — clean up tab + +- `mcp__plugin_chrome-devtools-mcp_chrome-devtools__new_page` — open browser tab +- `mcp__plugin_chrome-devtools-mcp_chrome-devtools__navigate_page` — load publisher URL +- `mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests` — capture JS requests +- `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` — settle window + detect head-loaded scripts via DOM query +- `mcp__plugin_chrome-devtools-mcp_chrome-devtools__close_page` — clean up tab + +**Permission grants required:** `navigate_page`, `list_network_requests`, and `close_page` are not currently approved in `.claude/settings.json`. Add them to `permissions.allow` before running the skill, or expect interactive permission prompts on first run. 
**File tools used:** + - `Read` — read `trusted-server.toml` (publisher domain) and existing `js-assets.toml` (diff mode) - `Write` — write generated/updated `js-assets.toml` @@ -199,7 +234,7 @@ The Auditor is a Claude Code skill file. No compiled code. The Auditor should be delivered **after Proxy Phase 1** (so `js-assets.toml` schema is defined) and **before Proxy Phase 2** (so engineering has real populated entries to test the cache pipeline against actual vendor origins). -See [delivery order in the Proxy spec](2026-04-01-js-asset-proxy-design.md). +See [delivery order in the Proxy spec](2026-04-01-js-asset-proxy-design.md) _(on `js-asset-proxy-spec` branch until merged)_. --- From ee6ec587213a38375d39054fb3941f5d29f8bae2 Mon Sep 17 00:00:00 2001 From: Christian Date: Fri, 10 Apr 2026 16:03:50 -0500 Subject: [PATCH 03/12] Add JS Asset Auditor command and slug generation utility Implement the /audit-js-assets command that sweeps a publisher page via Chrome DevTools MCP, detects third-party JS assets, and generates js-assets.toml entries. Includes a shared slug generation script (SHA-256 + base62) and adds MCP permission grants for navigate_page, list_network_requests, and close_page. --- .claude/commands/audit-js-assets.md | 198 ++++++++++++++++++++++++++++ .claude/settings.json | 6 +- scripts/js-asset-slug.mjs | 89 +++++++++++++ 3 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 .claude/commands/audit-js-assets.md create mode 100755 scripts/js-asset-slug.mjs diff --git a/.claude/commands/audit-js-assets.md b/.claude/commands/audit-js-assets.md new file mode 100644 index 00000000..57b5a5e8 --- /dev/null +++ b/.claude/commands/audit-js-assets.md @@ -0,0 +1,198 @@ +Audit a publisher page for third-party JS assets and generate `js-assets.toml` entries. 
+ +Usage: /audit-js-assets $ARGUMENTS + +`$ARGUMENTS`: `<url> [--diff] [--settle <ms>] [--first-party <host>,...]` + +- `<url>` — publisher page URL (required) +- `--diff` — compare sweep against existing `js-assets.toml` instead of generating from scratch +- `--settle <ms>` — settle window in milliseconds after page load (default: 6000) +- `--first-party <host>,...` — additional hosts to treat as first-party (comma-separated) + +--- + +Follow these steps exactly. Stop and report if any step fails. + +## 1. Parse arguments + +Extract the URL from `$ARGUMENTS` (required — error if missing). Parse optional flags: `--diff` (boolean), `--settle <ms>` (integer, default 6000), `--first-party <host>,...` (comma-separated list). + +## 2. Read publisher config + +Use the `Read` tool on `trusted-server.toml` in the repo root. Extract the `domain` value from the `[publisher]` section. Error if the file is missing or `[publisher].domain` is not found. + +## 3. Open browser and navigate + +1. Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__new_page` to open a new browser tab +2. Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__navigate_page` with the target URL +3. If navigation fails, close the page and report the error + +## 4. Wait for page settle + +Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` with: + +```js +await new Promise(r => setTimeout(r, SETTLE_MS)) +``` + +Replace `SETTLE_MS` with the `--settle` value (default 6000). + +## 5. Collect data + +Make these two calls in parallel: + +**Network requests:** +Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests` with `resourceTypes: ["script"]`. Save the full list of script URLs. + +**Head scripts:** +Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` with: + +```js +Array.from(document.head.querySelectorAll('script[src]')).map(s => s.src) +``` + +Save the resulting array — this determines `inject_in_head` later. + +## 6. 
URL normalization + +For each captured script URL, normalize it: + +1. Strip the fragment (`#` and everything after) +2. Strip all query parameters (`?` and everything after) +3. Strip trailing slash from the path + +Use the **normalized** URL for all subsequent steps (filtering, slug generation, `origin_url` output). + +## 7. First-party filtering + +For each normalized URL, parse the hostname. Strip a leading `www.` from both the URL's host and `publisher.domain`. If they match exactly, exclude the URL. Also exclude URLs whose host (after `www.` stripping) matches any `--first-party` host. + +Count and track excluded URLs — they appear neither in the output nor in the filtered summary. + +## 8. Heuristic filtering + +Exclude URLs whose host matches any entry below using **dot-boundary suffix matching**: the URL's host must either equal the filter entry or end with `.` + the filter entry. For example, `sentry.io` matches `sentry.io` and `o123.ingest.sentry.io` but not `notsentry.io`. + +| Category | Excluded hosts | +|---|---| +| Framework CDNs | `cdnjs.cloudflare.com`, `ajax.googleapis.com`, `cdn.jsdelivr.net`, `unpkg.com` | +| Error tracking | `sentry.io`, `bugsnag.com`, `rollbar.com` | +| Font services | `fonts.googleapis.com`, `fonts.gstatic.com` | +| Social embeds | `platform.twitter.com`, `platform.x.com`, `connect.facebook.net` | + +**`googletagmanager.com` is NOT filtered** — GTM is ad tech and should be proxied. + +Track each filtered URL with its category and host for the terminal summary. + +## 9. Wildcard detection + +For each surviving URL, check each path segment (split by `/`) against these patterns. 
Replace matching segments with `*`: + +- **Semver:** `/^\d+\.\d+[\.\d-]*$/` (e.g., `1.19.8-hcskhn`) +- **Hex hash:** `/^[a-f0-9]{8,}$/` (lowercase hex, 8+ chars) +- **Mixed alphanumeric hash:** `/^[A-Za-z0-9]{8,}$/` AND the segment must contain at least one digit AND at least one letter (excludes dictionary words like `analytics`) + +If any segment was wildcarded, save the **original** URL (before substitution) as a comment for the TOML entry. + +## 10. Slug generation + +For each surviving asset, generate a slug by running: + +```bash +node scripts/js-asset-slug.mjs "<publisher_domain>" "<origin_url>" +``` + +The output is the full slug (e.g., `ZSZksDbq:prebid-load`). Extract the part before `:` as the `publisher_prefix` for the path field. + +## 11. Determine `inject_in_head` + +For each asset, check if its normalized URL appears in the head scripts list from step 5. If yes, set `inject_in_head = true`. Otherwise, `inject_in_head = false`. + +Note: normalize each head-script URL the same way (step 6) before comparing — the raw head scripts list may contain URLs with query params that were stripped during normalization. + +## 12. Build path + +For each asset: +- **Fixed (no wildcards):** `path = "/js-assets/{publisher_prefix}/{asset_stem}.js"` +- **Wildcard:** `path = "/js-assets/{publisher_prefix}/*"` + +Where `publisher_prefix` is the 8-char prefix from the slug, and `asset_stem` is the filename without extension from the URL. + +## 13. Generate output + +### Init mode (no `--diff`) + +Write `js-assets.toml` to the repo root using the `Write` tool: + +```toml +# Generated by /audit-js-assets on YYYY-MM-DD +# Publisher: {publisher.domain} +# Source URL: {target_url} + +[[js_assets]] +# {original_url} +slug = "{slug}" +path = "{path}" +origin_url = "{normalized_origin_url_with_wildcards}" +inject_in_head = {true|false} +``` + +Add the comment `# {original_url} (wildcard detected)` above entries with wildcard substitution. + +### Diff mode (`--diff`) + +1. Read the existing `js-assets.toml` with the `Read` tool +2. 
Parse existing entries by `origin_url` (after normalizing both) +3. Classify each asset: + - **Confirmed:** in both sweep and file + - **New:** in sweep but not in file → append as commented-out TOML block + - **Missing:** in file but not in sweep → flag in terminal only, do NOT modify the file +4. Append new entries to `js-assets.toml` as comments: + +```toml +# --- NEW (detected by /audit-js-assets --diff on YYYY-MM-DD, uncomment to activate) --- +# [[js_assets]] +# # {original_url} +# slug = "{slug}" +# path = "{path}" +# origin_url = "{normalized_origin_url_with_wildcards}" +# inject_in_head = {true|false} +``` + +## 14. Terminal summary + +Print a formatted summary to the user. + +### Init mode + +``` +JS Asset Audit — {publisher.domain} +──────────────────────────────── +Detected: {total} third-party JS requests +Filtered: {filtered_count} ({host} ×{count}, ...) +Surfaced: {surfaced_count} assets → js-assets.toml + + {prefix} inject_in_head={true|false} {host}/.../{filename} + {prefix} inject_in_head={true|false} {host}/.../{filename} [wildcard] + ... + +Review inject_in_head values and commit js-assets.toml when ready. +Diff mode: /audit-js-assets --diff +``` + +### Diff mode + +``` +JS Asset Audit (diff) — {publisher.domain} +──────────────────────────────── +Confirmed: {count} assets still present on page +New: {count} asset(s) detected (appended as comment to js-assets.toml) +Missing: {count} asset(s) no longer seen on page ⚠ + + NEW {prefix} {host}/.../{filename} → review in js-assets.toml + MISSING {prefix} {host}/.../{filename} → may have been removed or renamed +``` + +## 15. Cleanup + +Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__close_page` to close the browser tab. 
diff --git a/.claude/settings.json b/.claude/settings.json index 02b602d4..77de0b65 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -23,9 +23,13 @@ "Bash(git diff:*)", "Bash(git log:*)", "Bash(git status:*)", + "Bash(node scripts/js-asset-slug.mjs:*)", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__new_page", - "mcp__plugin_chrome-devtools-mcp_chrome-devtools__performance_stop_trace", + "mcp__plugin_chrome-devtools-mcp_chrome-devtools__navigate_page", + "mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script", + "mcp__plugin_chrome-devtools-mcp_chrome-devtools__close_page", + "mcp__plugin_chrome-devtools-mcp_chrome-devtools__performance_stop_trace", ] }, "enabledPlugins": { diff --git a/scripts/js-asset-slug.mjs b/scripts/js-asset-slug.mjs new file mode 100755 index 00000000..1169abe9 --- /dev/null +++ b/scripts/js-asset-slug.mjs @@ -0,0 +1,89 @@ +#!/usr/bin/env node + +// JS Asset Slug Generator +// +// Shared utility for generating deterministic slugs for js-assets.toml entries. +// Used by the /audit-js-assets command and must produce identical output to the +// Rust proxy's KV key derivation. +// +// Algorithm: +// publisher_prefix = first_8_chars(base62(sha256(domain + "|" + url))) +// asset_stem = filename_without_extension(url) +// slug = "{publisher_prefix}:{asset_stem}" +// +// base62 charset: 0-9A-Za-z (digits first, then uppercase, then lowercase) +// +// Usage: +// node scripts/js-asset-slug.mjs +// node scripts/js-asset-slug.mjs test-publisher.com https://vendor.io/sdk/loader.js +// # Output: <8-char-prefix>:loader + +import { createHash } from "node:crypto"; +import { posix } from "node:path"; + +const BASE62_CHARSET = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +function bufferToBase62(buffer) { + // Treat the buffer as a big-endian unsigned integer and convert to base62. 
+ let num = 0n; + for (const byte of buffer) { + num = (num << 8n) | BigInt(byte); + } + + if (num === 0n) return "0"; + + const chars = []; + while (num > 0n) { + chars.push(BASE62_CHARSET[Number(num % 62n)]); + num = num / 62n; + } + + return chars.reverse().join(""); +} + +function extractAssetStem(originUrl) { + let pathname; + try { + pathname = new URL(originUrl).pathname; + } catch { + pathname = originUrl; + } + + // Remove trailing slash + if (pathname.endsWith("/")) { + pathname = pathname.slice(0, -1); + } + + const basename = posix.basename(pathname); + if (!basename || basename === "/") { + // Fallback: use last non-empty path segment + const segments = pathname.split("/").filter(Boolean); + const last = segments.at(-1) || "unknown"; + const dot = last.lastIndexOf("."); + return dot > 0 ? last.slice(0, dot) : last; + } + + const dot = basename.lastIndexOf("."); + return dot > 0 ? basename.slice(0, dot) : basename; +} + +function generateSlug(publisherDomain, originUrl) { + const input = `${publisherDomain}|${originUrl}`; + const digest = createHash("sha256").update(input).digest(); + const base62 = bufferToBase62(digest); + const publisherPrefix = base62.slice(0, 8); + const assetStem = extractAssetStem(originUrl); + return `${publisherPrefix}:${assetStem}`; +} + +const [publisherDomain, originUrl] = process.argv.slice(2); + +if (!publisherDomain || !originUrl) { + console.error( + "Usage: node scripts/js-asset-slug.mjs ", + ); + process.exit(1); +} + +console.log(generateSlug(publisherDomain, originUrl)); From cc234102efcfa6b74f3ebb76d07e8cb3d233607f Mon Sep 17 00:00:00 2001 From: Christian Date: Fri, 10 Apr 2026 18:56:54 -0500 Subject: [PATCH 04/12] Add processing script and expand heuristic filters for JS Asset Auditor Move URL normalization, filtering, wildcard detection, slug generation, and TOML formatting into scripts/audit-js-assets.mjs. 
The skill now collects raw browser data and delegates processing to the script, replacing fragile LLM-side URL manipulation. Expand heuristic filter with Google ad rendering, ad fraud detection, ad verification, and reCAPTCHA categories. Auto-include target URL host as first-party. Add --no-filter flag. Fix semver regex to match alpha suffixes like 1.19.8-hcskhn. --- .claude/commands/audit-js-assets.md | 153 ++--- .claude/settings.json | 3 +- .../2026-04-01-js-asset-auditor-design.md | 44 +- scripts/audit-js-assets.mjs | 524 ++++++++++++++++++ 4 files changed, 593 insertions(+), 131 deletions(-) create mode 100644 scripts/audit-js-assets.mjs diff --git a/.claude/commands/audit-js-assets.md b/.claude/commands/audit-js-assets.md index 57b5a5e8..b14fc907 100644 --- a/.claude/commands/audit-js-assets.md +++ b/.claude/commands/audit-js-assets.md @@ -2,12 +2,13 @@ Audit a publisher page for third-party JS assets and generate `js-assets.toml` e Usage: /audit-js-assets $ARGUMENTS -`$ARGUMENTS`: ` [--diff] [--settle ] [--first-party ,...]` +`$ARGUMENTS`: ` [--diff] [--settle ] [--first-party ,...] [--no-filter]` - `` — publisher page URL (required) - `--diff` — compare sweep against existing `js-assets.toml` instead of generating from scratch - `--settle ` — settle window in milliseconds after page load (default: 6000) - `--first-party ,...` — additional hosts to treat as first-party (comma-separated) +- `--no-filter` — bypass heuristic filtering for full visibility --- @@ -15,7 +16,7 @@ Follow these steps exactly. Stop and report if any step fails. ## 1. Parse arguments -Extract the URL from `$ARGUMENTS` (required — error if missing). Parse optional flags: `--diff` (boolean), `--settle ` (integer, default 6000), `--first-party ,...` (comma-separated list). +Extract the URL from `$ARGUMENTS` (required — error if missing). Parse optional flags: `--diff` (boolean), `--settle ` (integer, default 6000), `--first-party ,...` (comma-separated list), `--no-filter` (boolean). ## 2. 
Read publisher config @@ -32,7 +33,7 @@ Use the `Read` tool on `trusted-server.toml` in the repo root. Extract the `doma Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` with: ```js -await new Promise(r => setTimeout(r, SETTLE_MS)) +async () => { await new Promise(r => setTimeout(r, SETTLE_MS)); return "settled"; } ``` Replace `SETTLE_MS` with the `--settle` value (default 6000). @@ -48,132 +49,48 @@ Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests` wi Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` with: ```js -Array.from(document.head.querySelectorAll('script[src]')).map(s => s.src) +() => { return Array.from(document.head.querySelectorAll('script[src]')).map(s => s.src); } ``` -Save the resulting array — this determines `inject_in_head` later. +Save the resulting array. -## 6. URL normalization +## 6. Process assets -For each captured script URL, normalize it: +Write a JSON file containing the collected data: -1. Strip the fragment (`#` and everything after) -2. Strip all query parameters (`?` and everything after) -3. Strip trailing slash from the path - -Use the **normalized** URL for all subsequent steps (filtering, slug generation, `origin_url` output). - -## 7. First-party filtering - -For each normalized URL, parse the hostname. Strip a leading `www.` from both the URL's host and `publisher.domain`. If they match exactly, exclude the URL. Also exclude URLs whose host (after `www.` stripping) matches any `--first-party` host. - -Count and track excluded URLs — they don't appear in output but don't appear in the filtered summary either. - -## 8. Heuristic filtering - -Exclude URLs whose host matches any entry below using **dot-boundary suffix matching**: the URL's host must either equal the filter entry or end with `.` + the filter entry. For example, `sentry.io` matches `sentry.io` and `o123.ingest.sentry.io` but not `notsentry.io`. 
- -| Category | Excluded hosts | -|---|---| -| Framework CDNs | `cdnjs.cloudflare.com`, `ajax.googleapis.com`, `cdn.jsdelivr.net`, `unpkg.com` | -| Error tracking | `sentry.io`, `bugsnag.com`, `rollbar.com` | -| Font services | `fonts.googleapis.com`, `fonts.gstatic.com` | -| Social embeds | `platform.twitter.com`, `platform.x.com`, `connect.facebook.net` | - -**`googletagmanager.com` is NOT filtered** — GTM is ad tech and should be proxied. - -Track each filtered URL with its category and host for the terminal summary. - -## 9. Wildcard detection - -For each surviving URL, check each path segment (split by `/`) against these patterns. Replace matching segments with `*`: - -- **Semver:** `/^\d+\.\d+[\.\d-]*$/` (e.g., `1.19.8-hcskhn`) -- **Hex hash:** `/^[a-f0-9]{8,}$/` (lowercase hex, 8+ chars) -- **Mixed alphanumeric hash:** `/^[A-Za-z0-9]{8,}$/` AND the segment must contain at least one digit AND at least one letter (excludes dictionary words like `analytics`) - -If any segment was wildcarded, save the **original** URL (before substitution) as a comment for the TOML entry. - -## 10. Slug generation - -For each surviving asset, generate a slug by running: - -```bash -node scripts/js-asset-slug.mjs "" "" +```json +{"networkUrls": [], "headUrls": []} ``` -The output is the full slug (e.g., `ZSZksDbq:prebid-load`). Extract the part before `:` as the `publisher_prefix` for the path field. - -## 11. Determine `inject_in_head` - -For each asset, check if its normalized URL appears in the head scripts list from step 5. If yes, set `inject_in_head = true`. Otherwise, `inject_in_head = false`. - -Note: compare normalized URLs — the head scripts list may contain URLs with query params that were stripped during normalization. - -## 12. 
Build path - -For each asset: -- **Fixed (no wildcards):** `path = "/js-assets/{publisher_prefix}/{asset_stem}.js"` -- **Wildcard:** `path = "/js-assets/{publisher_prefix}/*"` +Use the `Write` tool to create `/tmp/audit-input.json`, then run: -Where `publisher_prefix` is the 8-char prefix from the slug, and `asset_stem` is the filename without extension from the URL. - -## 13. Generate output - -### Init mode (no `--diff`) - -Write `js-assets.toml` to the repo root using the `Write` tool: - -```toml -# Generated by /audit-js-assets on YYYY-MM-DD -# Publisher: {publisher.domain} -# Source URL: {target_url} - -[[js_assets]] -# {original_url} -slug = "{slug}" -path = "{path}" -origin_url = "{normalized_origin_url_with_wildcards}" -inject_in_head = {true|false} +```bash +cat /tmp/audit-input.json | node scripts/audit-js-assets.mjs \ + --domain "" \ + --target "" \ + --output js-assets.toml \ + [--diff] \ + [--first-party ] \ + [--no-filter] ``` -Add the comment `# {original_url} (wildcard detected)` above entries with wildcard substitution. - -### Diff mode (`--diff`) - -1. Read the existing `js-assets.toml` with the `Read` tool -2. Parse existing entries by `origin_url` (after normalizing both) -3. Classify each asset: - - **Confirmed:** in both sweep and file - - **New:** in sweep but not in file → append as commented-out TOML block - - **Missing:** in file but not in sweep → flag in terminal only, do NOT modify the file -4. Append new entries to `js-assets.toml` as comments: - -```toml -# --- NEW (detected by /audit-js-assets --diff on YYYY-MM-DD, uncomment to activate) --- -# [[js_assets]] -# # {original_url} -# slug = "{slug}" -# path = "{path}" -# origin_url = "{normalized_origin_url_with_wildcards}" -# inject_in_head = {true|false} -``` +The script writes TOML to the output file and prints a JSON summary to stdout. -## 14. Terminal summary +## 7. Terminal summary -Print a formatted summary to the user. 
+Parse the JSON summary from step 6 and print a formatted report. ### Init mode ``` -JS Asset Audit — {publisher.domain} +JS Asset Audit — {publisherDomain} ──────────────────────────────── -Detected: {total} third-party JS requests -Filtered: {filtered_count} ({host} ×{count}, ...) -Surfaced: {surfaced_count} assets → js-assets.toml +Detected: {totalDetected} third-party JS requests +Filtered: {heuristicFilteredTotal} ({host} x{count}, ...) +Surfaced: {surfaced} assets → js-assets.toml - {prefix} inject_in_head={true|false} {host}/.../{filename} - {prefix} inject_in_head={true|false} {host}/.../{filename} [wildcard] + {prefix} inject_in_head={true|false} {shortUrl} + {prefix} inject_in_head={true|false} {shortUrl} [wildcard] ... Review inject_in_head values and commit js-assets.toml when ready. @@ -183,16 +100,16 @@ Diff mode: /audit-js-assets --diff ### Diff mode ``` -JS Asset Audit (diff) — {publisher.domain} +JS Asset Audit (diff) — {publisherDomain} ──────────────────────────────── -Confirmed: {count} assets still present on page -New: {count} asset(s) detected (appended as comment to js-assets.toml) -Missing: {count} asset(s) no longer seen on page ⚠ +Confirmed: {confirmed.length} assets still present on page +New: {new.length} asset(s) detected (appended as comment to js-assets.toml) +Missing: {missing.length} asset(s) no longer seen on page ⚠ - NEW {prefix} {host}/.../{filename} → review in js-assets.toml - MISSING {prefix} {host}/.../{filename} → may have been removed or renamed + NEW {prefix} {shortUrl} → review in js-assets.toml + MISSING {slug} {originUrl} → may have been removed or renamed ``` -## 15. Cleanup +## 8. Cleanup Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__close_page` to close the browser tab. 
diff --git a/.claude/settings.json b/.claude/settings.json index 77de0b65..3168ad34 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -24,12 +24,13 @@ "Bash(git log:*)", "Bash(git status:*)", "Bash(node scripts/js-asset-slug.mjs:*)", + "Bash(node scripts/audit-js-assets.mjs:*)", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__new_page", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__navigate_page", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__close_page", - "mcp__plugin_chrome-devtools-mcp_chrome-devtools__performance_stop_trace", + "mcp__plugin_chrome-devtools-mcp_chrome-devtools__performance_stop_trace" ] }, "enabledPlugins": { diff --git a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md index aae3db57..6bbb567f 100644 --- a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md +++ b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md @@ -24,6 +24,7 @@ It also runs as a monitoring tool — `--diff` mode compares a new sweep against /audit-js-assets https://www.publisher.com # init — generate js-assets.toml /audit-js-assets https://www.publisher.com --diff # diff — compare against existing file /audit-js-assets https://www.publisher.com --settle 15000 # longer settle for ad-tech-heavy pages +/audit-js-assets https://www.publisher.com --no-filter # bypass heuristic filtering ``` --- @@ -54,6 +55,8 @@ A network request is **first-party** if the request URL's host, after stripping Publisher-owned CDN subdomains (e.g., `cdn.publisher.com`, `static.publisher.com`) are treated as third-party by default. If the publisher wants to exclude them, they can be added to a `first_party_hosts` list in the command invocation (e.g., `--first-party cdn.publisher.com`). 
+**Auto-detection:** The target URL's hostname is automatically included as first-party, in addition to `publisher.domain` from `trusted-server.toml`. This ensures that auditing `https://golf.com` when `publisher.domain = "test-publisher.com"` correctly excludes `golf.com` scripts without requiring `--first-party golf.com`. + ### URL normalization Applied to every captured script URL before slug generation and before persisting `origin_url`: @@ -72,15 +75,26 @@ The following origin categories are excluded silently. The terminal summary repo **Matching:** Filter entries match if the request URL's host ends with the filter entry, with a dot-boundary check. For example, `googletagmanager.com` in the filter matches `www.googletagmanager.com` but not `evil-googletagmanager.com`. -| Category | Excluded origins | -| -------------- | ------------------------------------------------------------------------------ | -| Framework CDNs | `cdnjs.cloudflare.com`, `ajax.googleapis.com`, `cdn.jsdelivr.net`, `unpkg.com` | -| Error tracking | `sentry.io`, `bugsnag.com`, `rollbar.com` | -| Font services | `fonts.googleapis.com`, `fonts.gstatic.com` | -| Social embeds | `platform.twitter.com`, `platform.x.com`, `connect.facebook.net` | +| Category | Excluded origins | +| ------------------- | --------------------------------------------------------------------------------------------- | +| Framework CDNs | `cdnjs.cloudflare.com`, `ajax.googleapis.com`, `cdn.jsdelivr.net`, `unpkg.com` | +| Error tracking | `sentry.io`, `bugsnag.com`, `rollbar.com` | +| Font services | `fonts.googleapis.com`, `fonts.gstatic.com` | +| Social embeds | `platform.twitter.com`, `platform.x.com`, `connect.facebook.net` | +| Google ad rendering | `pagead2.googlesyndication.com`, `tpc.googlesyndication.com`, `s0.2mdn.net`, | +| | `googleads.g.doubleclick.net`, `www.googleadservices.com` | +| Ad fraud detection | `adtrafficquality.google` | +| Ad verification | `adsafeprotected.com`, `moatads.com`, 
`doubleverify.com` | +| reCAPTCHA | `recaptcha.net`, `www.google.com/recaptcha/*`, `www.gstatic.com/recaptcha/*` | + +**Path-prefix matching:** Some hosts (e.g., `www.google.com`) serve both filterable and non-filterable resources. Entries with a path suffix (e.g., `www.google.com/recaptcha/*`) match only when the URL's path begins with the specified prefix. Plain host entries use dot-boundary suffix matching as before. **`googletagmanager.com` is not filtered** — GTM is ad tech and should be proxied. +**`securepubads.g.doubleclick.net` is not filtered** — this is the GPT ad server SDK. Publishers deliberately place this tag. Its sub-resources (e.g., `pubads_impl.js`) are also intentional. The filter targets ad-rendering infrastructure (iframes, creatives, verification), not ad-serving SDKs. + +**`--no-filter`** bypasses heuristic filtering entirely, surfacing all non-first-party scripts. First-party filtering always applies. + Everything else surfaces for operator review. --- @@ -116,7 +130,7 @@ The pipe (`|`) separator is required — it cannot appear in domain names or at Path segments matching any of these patterns are replaced with `*`: -- Semver: `\d+\.\d+[\.\d-]*` (e.g., `1.19.8-hcskhn`) +- Semver: `\d+\.\d+[\.\d\w-]*` (e.g., `1.19.8-hcskhn`) - Hex hash: `[a-f0-9]{8,}` between path separators (lowercase hex, minimum 8 characters) - Mixed alphanumeric hash: `[A-Za-z0-9]{8,}` between path separators, **must contain at least one digit and at least one letter** — this excludes pure-alpha dictionary words like `analytics` or `bootstrap` @@ -209,9 +223,12 @@ Missing: 1 asset no longer seen on page ⚠ ## Implementation -The Auditor is a Claude Code skill file. No compiled code. +The Auditor has two components: + +1. **Skill file** (`.claude/commands/audit-js-assets.md`) — Drives the browser via Chrome DevTools MCP, collects raw script URLs, and calls the processing script. ~100 lines. +2. 
**Processing script** (`scripts/audit-js-assets.mjs`) — Pure Node.js script (no external dependencies) that performs URL normalization, first-party filtering, heuristic filtering, wildcard detection, slug generation, and TOML formatting. Takes raw data on stdin (JSON with `networkUrls` and `headUrls`), writes TOML to a file, and prints a JSON summary to stdout. -**Skill location:** `.claude/commands/audit-js-assets.md` +This split ensures deterministic, testable processing (the script) while keeping browser automation in the LLM's domain (the skill). **MCP tools used:** @@ -221,12 +238,15 @@ The Auditor is a Claude Code skill file. No compiled code. - `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` — settle window + detect head-loaded scripts via DOM query - `mcp__plugin_chrome-devtools-mcp_chrome-devtools__close_page` — clean up tab -**Permission grants required:** `navigate_page`, `list_network_requests`, and `close_page` are not currently approved in `.claude/settings.json`. Add them to `permissions.allow` before running the skill, or expect interactive permission prompts on first run. 
+**Standalone utilities:** + +- `scripts/js-asset-slug.mjs` — Standalone slug generator for individual URLs +- `scripts/audit-js-assets.mjs` — Full audit processing pipeline **File tools used:** -- `Read` — read `trusted-server.toml` (publisher domain) and existing `js-assets.toml` (diff mode) -- `Write` — write generated/updated `js-assets.toml` +- `Read` — read `trusted-server.toml` (publisher domain) +- `Write` — write input JSON for processing script --- diff --git a/scripts/audit-js-assets.mjs b/scripts/audit-js-assets.mjs new file mode 100644 index 00000000..14c769e2 --- /dev/null +++ b/scripts/audit-js-assets.mjs @@ -0,0 +1,524 @@ +#!/usr/bin/env node + +// JS Asset Auditor — Processing Script +// +// Takes raw browser data (network script URLs + head script URLs) on stdin, +// applies normalization, filtering, wildcard detection, and slug generation, +// then writes a js-assets.toml file and prints a JSON summary to stdout. +// +// Usage: +// cat input.json | node scripts/audit-js-assets.mjs \ +// --domain --target \ +// [--output js-assets.toml] [--diff] [--first-party ] [--no-filter] +// +// Stdin format: +// {"networkUrls": ["https://..."], "headUrls": ["https://..."]} +// +// The slug algorithm is duplicated from scripts/js-asset-slug.mjs. Both files +// must produce identical output. Any changes must be synchronized. + +import { createHash } from "node:crypto"; +import { posix } from "node:path"; +import { readFileSync, writeFileSync } from "node:fs"; + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +const BASE62_CHARSET = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +// Heuristic filter: host-only entries use dot-boundary suffix matching. +// Entries with a `pathPrefix` also require the URL path to start with it. 
+const HEURISTIC_FILTERS = { + "Framework CDNs": ["cdnjs.cloudflare.com", "ajax.googleapis.com", "cdn.jsdelivr.net", "unpkg.com"], + "Error tracking": ["sentry.io", "bugsnag.com", "rollbar.com"], + "Font services": ["fonts.googleapis.com", "fonts.gstatic.com"], + "Social embeds": ["platform.twitter.com", "platform.x.com", "connect.facebook.net"], + "Google ad rendering": [ + "pagead2.googlesyndication.com", + "tpc.googlesyndication.com", + "s0.2mdn.net", + "googleads.g.doubleclick.net", + "www.googleadservices.com", + ], + "Ad fraud detection": ["adtrafficquality.google"], + "Ad verification": ["adsafeprotected.com", "moatads.com", "doubleverify.com"], + reCAPTCHA: [ + "recaptcha.net", + { host: "www.google.com", pathPrefix: "/recaptcha/" }, + { host: "www.gstatic.com", pathPrefix: "/recaptcha/" }, + ], +}; + +const SEMVER_RE = /^\d+\.\d+[\.\d\w-]*$/; +const HEX_HASH_RE = /^[a-f0-9]{8,}$/; +const MIXED_HASH_RE = /^[A-Za-z0-9]{8,}$/; + +// --------------------------------------------------------------------------- +// Slug generation (duplicated from scripts/js-asset-slug.mjs) +// --------------------------------------------------------------------------- + +function bufferToBase62(buffer) { + let num = 0n; + for (const byte of buffer) { + num = (num << 8n) | BigInt(byte); + } + if (num === 0n) return "0"; + const chars = []; + while (num > 0n) { + chars.push(BASE62_CHARSET[Number(num % 62n)]); + num = num / 62n; + } + return chars.reverse().join(""); +} + +function extractAssetStem(originUrl) { + let pathname; + try { + pathname = new URL(originUrl).pathname; + } catch { + pathname = originUrl; + } + if (pathname.endsWith("/")) pathname = pathname.slice(0, -1); + const basename = posix.basename(pathname); + if (!basename || basename === "/") { + const segments = pathname.split("/").filter(Boolean); + const last = segments.at(-1) || "unknown"; + const dot = last.lastIndexOf("."); + return dot > 0 ? 
last.slice(0, dot) : last; + } + const dot = basename.lastIndexOf("."); + return dot > 0 ? basename.slice(0, dot) : basename; +} + +function generateSlug(publisherDomain, originUrl) { + const input = `${publisherDomain}|${originUrl}`; + const digest = createHash("sha256").update(input).digest(); + const base62 = bufferToBase62(digest); + const publisherPrefix = base62.slice(0, 8); + const assetStem = extractAssetStem(originUrl); + return `${publisherPrefix}:${assetStem}`; +} + +// --------------------------------------------------------------------------- +// URL processing +// --------------------------------------------------------------------------- + +function normalizeUrl(raw) { + let url = raw; + // Fix protocol-relative URLs + if (url.startsWith("//")) url = "https:" + url; + // Strip fragment + const hashIdx = url.indexOf("#"); + if (hashIdx !== -1) url = url.slice(0, hashIdx); + // Strip query params + const qIdx = url.indexOf("?"); + if (qIdx !== -1) url = url.slice(0, qIdx); + // Strip trailing slash + if (url.endsWith("/")) url = url.slice(0, -1); + return url; +} + +function stripWww(host) { + return host.startsWith("www.") ? host.slice(4) : host; +} + +function isFirstParty(hostname, publisherDomain, targetHost, extraHosts) { + const stripped = stripWww(hostname); + if (stripped === stripWww(publisherDomain)) return true; + if (stripped === stripWww(targetHost)) return true; + for (const h of extraHosts) { + if (stripped === stripWww(h)) return true; + } + return false; +} + +function dotBoundaryMatch(hostname, filterEntry) { + return hostname === filterEntry || hostname.endsWith("." 
+ filterEntry); +} + +function matchesHeuristicFilter(hostname, pathname) { + for (const [category, entries] of Object.entries(HEURISTIC_FILTERS)) { + for (const entry of entries) { + if (typeof entry === "string") { + if (dotBoundaryMatch(hostname, entry)) { + return { category, entry }; + } + } else { + // Path-prefix filter: {host, pathPrefix} + if ( + dotBoundaryMatch(hostname, entry.host) && + pathname.startsWith(entry.pathPrefix) + ) { + return { category, entry: `${entry.host}${entry.pathPrefix}*` }; + } + } + } + } + return null; +} + +// --------------------------------------------------------------------------- +// Wildcard detection +// --------------------------------------------------------------------------- + +function applyWildcards(url) { + let parsed; + try { + parsed = new URL(url); + } catch { + return { wildcarded: url, original: null, hasWildcard: false }; + } + const segments = parsed.pathname.split("/"); + let hasWildcard = false; + const newSegments = segments.map((seg) => { + if (!seg) return seg; + if (SEMVER_RE.test(seg)) { + hasWildcard = true; + return "*"; + } + if (HEX_HASH_RE.test(seg)) { + hasWildcard = true; + return "*"; + } + if ( + MIXED_HASH_RE.test(seg) && + /\d/.test(seg) && + /[a-zA-Z]/.test(seg) + ) { + hasWildcard = true; + return "*"; + } + return seg; + }); + const wildcarded = parsed.origin + newSegments.join("/"); + return { wildcarded, original: hasWildcard ? url : null, hasWildcard }; +} + +// --------------------------------------------------------------------------- +// TOML formatting +// --------------------------------------------------------------------------- + +function formatTomlEntry(asset, commented = false) { + const pfx = commented ? 
"# " : ""; + let block = ""; + if (asset.hasWildcard && asset.originalUrl) { + block += `${pfx}# ${asset.originalUrl} (wildcard detected)\n`; + } + block += `${pfx}slug = "${asset.slug}"\n`; + block += `${pfx}path = "${asset.path}"\n`; + block += `${pfx}origin_url = "${asset.originUrl}"\n`; + block += `${pfx}inject_in_head = ${asset.injectInHead}\n`; + return block; +} + +function shortenUrl(url) { + let parsed; + try { + parsed = new URL(url); + } catch { + return url; + } + const parts = parsed.pathname.split("/").filter(Boolean); + const filename = parts.at(-1) || parsed.pathname; + return `${parsed.hostname}/.../` + filename; +} + +// --------------------------------------------------------------------------- +// Diff mode: parse existing TOML +// --------------------------------------------------------------------------- + +function parseExistingToml(content) { + const entries = []; + const blocks = content.split("[[js_assets]]"); + // Skip the first element (preamble before the first [[js_assets]]) + for (let i = 1; i < blocks.length; i++) { + const block = blocks[i]; + const originMatch = block.match(/^origin_url\s*=\s*"([^"]+)"/m); + const slugMatch = block.match(/^slug\s*=\s*"([^"]+)"/m); + if (originMatch) { + entries.push({ + originUrl: originMatch[1], + slug: slugMatch ? 
slugMatch[1] : "", + }); + } + } + return entries; +} + +// --------------------------------------------------------------------------- +// CLI argument parsing +// --------------------------------------------------------------------------- + +function parseArgs(argv) { + const args = { + domain: null, + target: null, + output: "js-assets.toml", + diff: false, + firstParty: [], + noFilter: false, + }; + + for (let i = 2; i < argv.length; i++) { + switch (argv[i]) { + case "--domain": + args.domain = argv[++i]; + break; + case "--target": + args.target = argv[++i]; + break; + case "--output": + args.output = argv[++i]; + break; + case "--diff": + args.diff = true; + break; + case "--first-party": + args.firstParty = argv[++i].split(",").filter(Boolean); + break; + case "--no-filter": + args.noFilter = true; + break; + default: + console.error(`Unknown argument: ${argv[i]}`); + process.exit(1); + } + } + + if (!args.domain || !args.target) { + console.error( + "Usage: cat input.json | node scripts/audit-js-assets.mjs --domain --target [--output file] [--diff] [--first-party hosts] [--no-filter]", + ); + process.exit(1); + } + + return args; +} + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- + +async function main() { + const args = parseArgs(process.argv); + + // Read stdin + const chunks = []; + for await (const chunk of process.stdin) chunks.push(chunk); + const input = JSON.parse(Buffer.concat(chunks).toString()); + const { networkUrls: rawNetworkUrls, headUrls: rawHeadUrls } = input; + + // Determine target host for auto first-party detection + let targetHost = ""; + try { + targetHost = new URL(args.target).hostname; + } catch { + // If the target isn't a full URL, use it as-is + targetHost = args.target; + } + + // Step 1: Normalize and deduplicate + const normalizedNetwork = [ + ...new Set(rawNetworkUrls.map(normalizeUrl)), + ]; + const 
normalizedHead = new Set(rawHeadUrls.map(normalizeUrl)); + + // Step 2: First-party filter + const firstPartyFiltered = []; + const thirdPartyUrls = []; + + for (const url of normalizedNetwork) { + let hostname; + try { + hostname = new URL(url).hostname; + } catch { + continue; + } + if (isFirstParty(hostname, args.domain, targetHost, args.firstParty)) { + firstPartyFiltered.push({ url, host: hostname }); + } else { + thirdPartyUrls.push(url); + } + } + + // Step 3: Heuristic filter + const heuristicFiltered = []; + const survivingUrls = []; + + for (const url of thirdPartyUrls) { + let hostname, pathname; + try { + const parsed = new URL(url); + hostname = parsed.hostname; + pathname = parsed.pathname; + } catch { + survivingUrls.push(url); + continue; + } + + if (args.noFilter) { + survivingUrls.push(url); + continue; + } + + const match = matchesHeuristicFilter(hostname, pathname); + if (match) { + heuristicFiltered.push({ url, host: hostname, ...match }); + } else { + survivingUrls.push(url); + } + } + + // Aggregate filter counts by host + const filterCounts = {}; + for (const f of heuristicFiltered) { + filterCounts[f.host] = (filterCounts[f.host] || 0) + 1; + } + + // Step 4: Process surviving URLs + const assets = []; + const seenOrigins = new Set(); + + for (const url of survivingUrls) { + const { wildcarded, original, hasWildcard } = applyWildcards(url); + + // Deduplicate by wildcarded origin URL + if (seenOrigins.has(wildcarded)) continue; + seenOrigins.add(wildcarded); + + const slug = generateSlug(args.domain, wildcarded); + const prefix = slug.split(":")[0]; + const injectInHead = normalizedHead.has(url); + + let path; + if (hasWildcard) { + path = `/js-assets/${prefix}/*`; + } else { + const stem = extractAssetStem(wildcarded); + path = `/js-assets/${prefix}/${stem}.js`; + } + + let hostname; + try { + hostname = new URL(url).hostname; + } catch { + hostname = "unknown"; + } + + assets.push({ + slug, + prefix, + path, + originUrl: wildcarded, + 
originalUrl: original, + injectInHead, + hasWildcard, + host: hostname, + shortUrl: shortenUrl(wildcarded), + }); + } + + // Step 5: Generate output + const today = new Date().toISOString().slice(0, 10); + + if (args.diff) { + // Diff mode + let existingContent; + try { + existingContent = readFileSync(args.output, "utf-8"); + } catch { + console.error(`Error: cannot read ${args.output} for diff mode`); + process.exit(1); + } + + const existingEntries = parseExistingToml(existingContent); + const existingOrigins = new Set( + existingEntries.map((e) => e.originUrl), + ); + const sweepOrigins = new Set(assets.map((a) => a.originUrl)); + + const confirmed = existingEntries.filter((e) => + sweepOrigins.has(e.originUrl), + ); + const missing = existingEntries.filter( + (e) => !sweepOrigins.has(e.originUrl), + ); + const newAssets = assets.filter( + (a) => !existingOrigins.has(a.originUrl), + ); + + // Append new entries as comments + if (newAssets.length > 0) { + let appendBlock = `\n# --- NEW (detected by /audit-js-assets --diff on ${today}, uncomment to activate) ---\n`; + for (const a of newAssets) { + appendBlock += `\n# [[js_assets]]\n`; + appendBlock += formatTomlEntry(a, true); + } + writeFileSync(args.output, existingContent + appendBlock); + } + + // Print diff summary + const summary = { + mode: "diff", + publisherDomain: args.domain, + targetUrl: args.target, + confirmed: confirmed.map((e) => ({ + slug: e.slug, + originUrl: e.originUrl, + })), + new: newAssets.map((a) => ({ + slug: a.slug, + prefix: a.prefix, + shortUrl: a.shortUrl, + originUrl: a.originUrl, + })), + missing: missing.map((e) => ({ + slug: e.slug, + originUrl: e.originUrl, + })), + outputFile: args.output, + }; + console.log(JSON.stringify(summary)); + } else { + // Init mode + let toml = `# Generated by /audit-js-assets on ${today}\n`; + toml += `# Publisher: ${args.domain}\n`; + toml += `# Source URL: ${args.target}\n`; + + for (const a of assets) { + toml += `\n[[js_assets]]\n`; + toml += 
formatTomlEntry(a); + } + + writeFileSync(args.output, toml); + + // Build filter summary entries + const filterSummary = Object.entries(filterCounts).map( + ([host, count]) => ({ host, count }), + ); + + const summary = { + mode: "init", + publisherDomain: args.domain, + targetUrl: args.target, + totalDetected: thirdPartyUrls.length, + firstPartyFiltered: firstPartyFiltered.length, + firstPartyHost: targetHost, + heuristicFiltered: filterSummary, + heuristicFilteredTotal: heuristicFiltered.length, + surfaced: assets.length, + assets: assets.map((a) => ({ + prefix: a.prefix, + injectInHead: a.injectInHead, + shortUrl: a.shortUrl, + wildcard: a.hasWildcard, + })), + outputFile: args.output, + }; + console.log(JSON.stringify(summary)); + } +} + +main(); From 36c75d498dc4a8467271bb85a0fe2a53d86638db Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 13 Apr 2026 11:52:21 -0500 Subject: [PATCH 05/12] Add Playwright CLI for deterministic JS asset auditing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace MCP-driven browser automation with a standalone Playwright CLI at tools/js-asset-auditor/audit.mjs. One command sweeps a publisher page, collects script URLs, processes them through the shared pipeline, and writes js-assets.toml. Refactor scripts/audit-js-assets.mjs to export processAssets() so both the stdin-based pipeline and the Playwright CLI share the same processing logic. Simplify the Claude skill from 115 to 59 lines — it now calls the CLI and formats the JSON summary. 
--- .claude/commands/audit-js-assets.md | 76 +------- .claude/settings.json | 1 + .gitignore | 3 + scripts/audit-js-assets.mjs | 288 +++++++++++++--------------- tools/js-asset-auditor/audit.mjs | 218 +++++++++++++++++++++ tools/js-asset-auditor/package.json | 9 + 6 files changed, 379 insertions(+), 216 deletions(-) create mode 100644 tools/js-asset-auditor/audit.mjs create mode 100644 tools/js-asset-auditor/package.json diff --git a/.claude/commands/audit-js-assets.md b/.claude/commands/audit-js-assets.md index b14fc907..d088c7c0 100644 --- a/.claude/commands/audit-js-assets.md +++ b/.claude/commands/audit-js-assets.md @@ -2,83 +2,31 @@ Audit a publisher page for third-party JS assets and generate `js-assets.toml` e Usage: /audit-js-assets $ARGUMENTS -`$ARGUMENTS`: ` [--diff] [--settle ] [--first-party ,...] [--no-filter]` - -- `` — publisher page URL (required) -- `--diff` — compare sweep against existing `js-assets.toml` instead of generating from scratch -- `--settle ` — settle window in milliseconds after page load (default: 6000) -- `--first-party ,...` — additional hosts to treat as first-party (comma-separated) -- `--no-filter` — bypass heuristic filtering for full visibility +`$ARGUMENTS`: ` [--diff] [--settle ] [--first-party ,...] [--no-filter] [--headed]` --- Follow these steps exactly. Stop and report if any step fails. -## 1. Parse arguments - -Extract the URL from `$ARGUMENTS` (required — error if missing). Parse optional flags: `--diff` (boolean), `--settle ` (integer, default 6000), `--first-party ,...` (comma-separated list), `--no-filter` (boolean). - -## 2. Read publisher config - -Use the `Read` tool on `trusted-server.toml` in the repo root. Extract the `domain` value from the `[publisher]` section. Error if the file is missing or `[publisher].domain` is not found. - -## 3. Open browser and navigate - -1. Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__new_page` to open a new browser tab -2. 
Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__navigate_page` with the target URL -3. If navigation fails, close the page and report the error - -## 4. Wait for page settle - -Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` with: - -```js -async () => { await new Promise(r => setTimeout(r, SETTLE_MS)); return "settled"; } -``` - -Replace `SETTLE_MS` with the `--settle` value (default 6000). +## 1. Run the auditor -## 5. Collect data +Run the Playwright CLI via Bash, forwarding all arguments from `$ARGUMENTS`: -Make these two calls in parallel: - -**Network requests:** -Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests` with `resourceTypes: ["script"]`. Save the full list of script URLs. - -**Head scripts:** -Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` with: - -```js -() => { return Array.from(document.head.querySelectorAll('script[src]')).map(s => s.src); } +```bash +node tools/js-asset-auditor/audit.mjs $ARGUMENTS ``` -Save the resulting array. - -## 6. Process assets +The CLI reads `trusted-server.toml` for the publisher domain, opens a headless browser, collects script URLs, processes them, and writes `js-assets.toml`. Progress lines appear on stderr; a JSON summary prints to stdout. -Write a JSON file containing the collected data: - -```json -{"networkUrls": [], "headUrls": []} -``` - -Use the `Write` tool to create `/tmp/audit-input.json`, then run: +If the command fails with "Playwright not installed" or "Chromium not installed", tell the user to run: ```bash -cat /tmp/audit-input.json | node scripts/audit-js-assets.mjs \ - --domain "" \ - --target "" \ - --output js-assets.toml \ - [--diff] \ - [--first-party ] \ - [--no-filter] +cd tools/js-asset-auditor && npm install && npx playwright install chromium ``` -The script writes TOML to the output file and prints a JSON summary to stdout. - -## 7. Terminal summary +## 2. 
Show results -Parse the JSON summary from step 6 and print a formatted report. +Parse the JSON summary from stdout and print a formatted report. ### Init mode @@ -109,7 +57,3 @@ Missing: {missing.length} asset(s) no longer seen on page ⚠ NEW {prefix} {shortUrl} → review in js-assets.toml MISSING {slug} {originUrl} → may have been removed or renamed ``` - -## 8. Cleanup - -Call `mcp__plugin_chrome-devtools-mcp_chrome-devtools__close_page` to close the browser tab. diff --git a/.claude/settings.json b/.claude/settings.json index 3168ad34..8c0f3512 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -25,6 +25,7 @@ "Bash(git status:*)", "Bash(node scripts/js-asset-slug.mjs:*)", "Bash(node scripts/audit-js-assets.mjs:*)", + "Bash(node tools/js-asset-auditor/audit.mjs:*)", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__new_page", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__navigate_page", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests", diff --git a/.gitignore b/.gitignore index af70c452..282288cd 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,9 @@ src/*.html /guest-profiles /benchmark-results/** +# JS Asset Auditor tool +/tools/js-asset-auditor/node_modules/ + # Playwright browser tests /crates/integration-tests/browser/node_modules/ /crates/integration-tests/browser/test-results/ diff --git a/scripts/audit-js-assets.mjs b/scripts/audit-js-assets.mjs index 14c769e2..8bc7690c 100644 --- a/scripts/audit-js-assets.mjs +++ b/scripts/audit-js-assets.mjs @@ -1,12 +1,13 @@ #!/usr/bin/env node -// JS Asset Auditor — Processing Script +// JS Asset Auditor — Processing Library & CLI // -// Takes raw browser data (network script URLs + head script URLs) on stdin, -// applies normalization, filtering, wildcard detection, and slug generation, -// then writes a js-assets.toml file and prints a JSON summary to stdout. 
+// Provides URL processing functions (normalization, filtering, wildcard +// detection, slug generation, TOML formatting) used by both the standalone +// Playwright CLI (tools/js-asset-auditor/audit.mjs) and the stdin-based +// pipeline invoked by the Claude Code skill. // -// Usage: +// CLI usage (stdin mode): // cat input.json | node scripts/audit-js-assets.mjs \ // --domain --target \ // [--output js-assets.toml] [--diff] [--first-party ] [--no-filter] @@ -30,7 +31,7 @@ const BASE62_CHARSET = // Heuristic filter: host-only entries use dot-boundary suffix matching. // Entries with a `pathPrefix` also require the URL path to start with it. -const HEURISTIC_FILTERS = { +export const HEURISTIC_FILTERS = { "Framework CDNs": ["cdnjs.cloudflare.com", "ajax.googleapis.com", "cdn.jsdelivr.net", "unpkg.com"], "Error tracking": ["sentry.io", "bugsnag.com", "rollbar.com"], "Font services": ["fonts.googleapis.com", "fonts.gstatic.com"], @@ -73,7 +74,7 @@ function bufferToBase62(buffer) { return chars.reverse().join(""); } -function extractAssetStem(originUrl) { +export function extractAssetStem(originUrl) { let pathname; try { pathname = new URL(originUrl).pathname; @@ -92,7 +93,7 @@ function extractAssetStem(originUrl) { return dot > 0 ? 
basename.slice(0, dot) : basename; } -function generateSlug(publisherDomain, originUrl) { +export function generateSlug(publisherDomain, originUrl) { const input = `${publisherDomain}|${originUrl}`; const digest = createHash("sha256").update(input).digest(); const base62 = bufferToBase62(digest); @@ -105,17 +106,13 @@ function generateSlug(publisherDomain, originUrl) { // URL processing // --------------------------------------------------------------------------- -function normalizeUrl(raw) { +export function normalizeUrl(raw) { let url = raw; - // Fix protocol-relative URLs if (url.startsWith("//")) url = "https:" + url; - // Strip fragment const hashIdx = url.indexOf("#"); if (hashIdx !== -1) url = url.slice(0, hashIdx); - // Strip query params const qIdx = url.indexOf("?"); if (qIdx !== -1) url = url.slice(0, qIdx); - // Strip trailing slash if (url.endsWith("/")) url = url.slice(0, -1); return url; } @@ -124,7 +121,7 @@ function stripWww(host) { return host.startsWith("www.") ? host.slice(4) : host; } -function isFirstParty(hostname, publisherDomain, targetHost, extraHosts) { +export function isFirstParty(hostname, publisherDomain, targetHost, extraHosts) { const stripped = stripWww(hostname); if (stripped === stripWww(publisherDomain)) return true; if (stripped === stripWww(targetHost)) return true; @@ -138,7 +135,7 @@ function dotBoundaryMatch(hostname, filterEntry) { return hostname === filterEntry || hostname.endsWith("." 
+ filterEntry); } -function matchesHeuristicFilter(hostname, pathname) { +export function matchesHeuristicFilter(hostname, pathname) { for (const [category, entries] of Object.entries(HEURISTIC_FILTERS)) { for (const entry of entries) { if (typeof entry === "string") { @@ -146,7 +143,6 @@ function matchesHeuristicFilter(hostname, pathname) { return { category, entry }; } } else { - // Path-prefix filter: {host, pathPrefix} if ( dotBoundaryMatch(hostname, entry.host) && pathname.startsWith(entry.pathPrefix) @@ -163,7 +159,7 @@ function matchesHeuristicFilter(hostname, pathname) { // Wildcard detection // --------------------------------------------------------------------------- -function applyWildcards(url) { +export function applyWildcards(url) { let parsed; try { parsed = new URL(url); @@ -200,7 +196,7 @@ function applyWildcards(url) { // TOML formatting // --------------------------------------------------------------------------- -function formatTomlEntry(asset, commented = false) { +export function formatTomlEntry(asset, commented = false) { const pfx = commented ? 
"# " : ""; let block = ""; if (asset.hasWildcard && asset.originalUrl) { @@ -213,7 +209,7 @@ function formatTomlEntry(asset, commented = false) { return block; } -function shortenUrl(url) { +export function shortenUrl(url) { let parsed; try { parsed = new URL(url); @@ -229,10 +225,9 @@ function shortenUrl(url) { // Diff mode: parse existing TOML // --------------------------------------------------------------------------- -function parseExistingToml(content) { +export function parseExistingToml(content) { const entries = []; const blocks = content.split("[[js_assets]]"); - // Skip the first element (preamble before the first [[js_assets]]) for (let i = 1; i < blocks.length; i++) { const block = blocks[i]; const originMatch = block.match(/^origin_url\s*=\s*"([^"]+)"/m); @@ -248,84 +243,24 @@ function parseExistingToml(content) { } // --------------------------------------------------------------------------- -// CLI argument parsing +// Core processing pipeline // --------------------------------------------------------------------------- -function parseArgs(argv) { - const args = { - domain: null, - target: null, - output: "js-assets.toml", - diff: false, - firstParty: [], - noFilter: false, - }; - - for (let i = 2; i < argv.length; i++) { - switch (argv[i]) { - case "--domain": - args.domain = argv[++i]; - break; - case "--target": - args.target = argv[++i]; - break; - case "--output": - args.output = argv[++i]; - break; - case "--diff": - args.diff = true; - break; - case "--first-party": - args.firstParty = argv[++i].split(",").filter(Boolean); - break; - case "--no-filter": - args.noFilter = true; - break; - default: - console.error(`Unknown argument: ${argv[i]}`); - process.exit(1); - } - } - - if (!args.domain || !args.target) { - console.error( - "Usage: cat input.json | node scripts/audit-js-assets.mjs --domain --target [--output file] [--diff] [--first-party hosts] [--no-filter]", - ); - process.exit(1); - } - - return args; -} - -// 
--------------------------------------------------------------------------- -// Main -// --------------------------------------------------------------------------- - -async function main() { - const args = parseArgs(process.argv); - - // Read stdin - const chunks = []; - for await (const chunk of process.stdin) chunks.push(chunk); - const input = JSON.parse(Buffer.concat(chunks).toString()); +export function processAssets(input, args) { const { networkUrls: rawNetworkUrls, headUrls: rawHeadUrls } = input; - // Determine target host for auto first-party detection let targetHost = ""; try { targetHost = new URL(args.target).hostname; } catch { - // If the target isn't a full URL, use it as-is targetHost = args.target; } - // Step 1: Normalize and deduplicate - const normalizedNetwork = [ - ...new Set(rawNetworkUrls.map(normalizeUrl)), - ]; + // Normalize and deduplicate + const normalizedNetwork = [...new Set(rawNetworkUrls.map(normalizeUrl))]; const normalizedHead = new Set(rawHeadUrls.map(normalizeUrl)); - // Step 2: First-party filter + // First-party filter const firstPartyFiltered = []; const thirdPartyUrls = []; @@ -336,14 +271,14 @@ async function main() { } catch { continue; } - if (isFirstParty(hostname, args.domain, targetHost, args.firstParty)) { + if (isFirstParty(hostname, args.domain, targetHost, args.firstParty || [])) { firstPartyFiltered.push({ url, host: hostname }); } else { thirdPartyUrls.push(url); } } - // Step 3: Heuristic filter + // Heuristic filter const heuristicFiltered = []; const survivingUrls = []; @@ -371,20 +306,18 @@ async function main() { } } - // Aggregate filter counts by host const filterCounts = {}; for (const f of heuristicFiltered) { filterCounts[f.host] = (filterCounts[f.host] || 0) + 1; } - // Step 4: Process surviving URLs + // Process surviving URLs const assets = []; const seenOrigins = new Set(); for (const url of survivingUrls) { const { wildcarded, original, hasWildcard } = applyWildcards(url); - // Deduplicate by 
wildcarded origin URL if (seenOrigins.has(wildcarded)) continue; seenOrigins.add(wildcarded); @@ -420,86 +353,63 @@ async function main() { }); } - // Step 5: Generate output + // Generate TOML and summary const today = new Date().toISOString().slice(0, 10); if (args.diff) { - // Diff mode let existingContent; try { existingContent = readFileSync(args.output, "utf-8"); } catch { - console.error(`Error: cannot read ${args.output} for diff mode`); - process.exit(1); + return { error: `Cannot read ${args.output} for diff mode` }; } const existingEntries = parseExistingToml(existingContent); - const existingOrigins = new Set( - existingEntries.map((e) => e.originUrl), - ); + const existingOrigins = new Set(existingEntries.map((e) => e.originUrl)); const sweepOrigins = new Set(assets.map((a) => a.originUrl)); - const confirmed = existingEntries.filter((e) => - sweepOrigins.has(e.originUrl), - ); - const missing = existingEntries.filter( - (e) => !sweepOrigins.has(e.originUrl), - ); - const newAssets = assets.filter( - (a) => !existingOrigins.has(a.originUrl), - ); + const confirmed = existingEntries.filter((e) => sweepOrigins.has(e.originUrl)); + const missing = existingEntries.filter((e) => !sweepOrigins.has(e.originUrl)); + const newAssets = assets.filter((a) => !existingOrigins.has(a.originUrl)); - // Append new entries as comments + let appendBlock = ""; if (newAssets.length > 0) { - let appendBlock = `\n# --- NEW (detected by /audit-js-assets --diff on ${today}, uncomment to activate) ---\n`; + appendBlock = `\n# --- NEW (detected by /audit-js-assets --diff on ${today}, uncomment to activate) ---\n`; for (const a of newAssets) { appendBlock += `\n# [[js_assets]]\n`; appendBlock += formatTomlEntry(a, true); } - writeFileSync(args.output, existingContent + appendBlock); } - // Print diff summary - const summary = { - mode: "diff", - publisherDomain: args.domain, - targetUrl: args.target, - confirmed: confirmed.map((e) => ({ - slug: e.slug, - originUrl: e.originUrl, - 
})), - new: newAssets.map((a) => ({ - slug: a.slug, - prefix: a.prefix, - shortUrl: a.shortUrl, - originUrl: a.originUrl, - })), - missing: missing.map((e) => ({ - slug: e.slug, - originUrl: e.originUrl, - })), - outputFile: args.output, + return { + toml: existingContent + appendBlock, + summary: { + mode: "diff", + publisherDomain: args.domain, + targetUrl: args.target, + confirmed: confirmed.map((e) => ({ slug: e.slug, originUrl: e.originUrl })), + new: newAssets.map((a) => ({ slug: a.slug, prefix: a.prefix, shortUrl: a.shortUrl, originUrl: a.originUrl })), + missing: missing.map((e) => ({ slug: e.slug, originUrl: e.originUrl })), + outputFile: args.output, + }, }; - console.log(JSON.stringify(summary)); - } else { - // Init mode - let toml = `# Generated by /audit-js-assets on ${today}\n`; - toml += `# Publisher: ${args.domain}\n`; - toml += `# Source URL: ${args.target}\n`; - - for (const a of assets) { - toml += `\n[[js_assets]]\n`; - toml += formatTomlEntry(a); - } + } - writeFileSync(args.output, toml); + // Init mode + let toml = `# Generated by /audit-js-assets on ${today}\n`; + toml += `# Publisher: ${args.domain}\n`; + toml += `# Source URL: ${args.target}\n`; - // Build filter summary entries - const filterSummary = Object.entries(filterCounts).map( - ([host, count]) => ({ host, count }), - ); + for (const a of assets) { + toml += `\n[[js_assets]]\n`; + toml += formatTomlEntry(a); + } + + const filterSummary = Object.entries(filterCounts).map(([host, count]) => ({ host, count })); - const summary = { + return { + toml, + summary: { mode: "init", publisherDomain: args.domain, targetUrl: args.target, @@ -516,9 +426,87 @@ async function main() { wildcard: a.hasWildcard, })), outputFile: args.output, - }; - console.log(JSON.stringify(summary)); + }, + }; +} + +// --------------------------------------------------------------------------- +// CLI argument parsing (for stdin mode) +// 
--------------------------------------------------------------------------- + +function parseArgs(argv) { + const args = { + domain: null, + target: null, + output: "js-assets.toml", + diff: false, + firstParty: [], + noFilter: false, + }; + + for (let i = 2; i < argv.length; i++) { + switch (argv[i]) { + case "--domain": + args.domain = argv[++i]; + break; + case "--target": + args.target = argv[++i]; + break; + case "--output": + args.output = argv[++i]; + break; + case "--diff": + args.diff = true; + break; + case "--first-party": + args.firstParty = argv[++i].split(",").filter(Boolean); + break; + case "--no-filter": + args.noFilter = true; + break; + default: + console.error(`Unknown argument: ${argv[i]}`); + process.exit(1); + } + } + + if (!args.domain || !args.target) { + console.error( + "Usage: cat input.json | node scripts/audit-js-assets.mjs --domain --target [--output file] [--diff] [--first-party hosts] [--no-filter]", + ); + process.exit(1); } + + return args; } -main(); +// --------------------------------------------------------------------------- +// CLI entry point (stdin mode) — only runs when invoked directly +// --------------------------------------------------------------------------- + +async function main() { + const args = parseArgs(process.argv); + + const chunks = []; + for await (const chunk of process.stdin) chunks.push(chunk); + const input = JSON.parse(Buffer.concat(chunks).toString()); + + const result = processAssets(input, args); + + if (result.error) { + console.error(result.error); + process.exit(1); + } + + writeFileSync(args.output, result.toml); + console.log(JSON.stringify(result.summary)); +} + +// Only run main() when this file is executed directly, not when imported +const isDirectExecution = + process.argv[1] && + new URL(process.argv[1], "file://").href === import.meta.url; + +if (isDirectExecution) { + main(); +} diff --git a/tools/js-asset-auditor/audit.mjs b/tools/js-asset-auditor/audit.mjs new file mode 100644 
index 00000000..b36d6a6b --- /dev/null +++ b/tools/js-asset-auditor/audit.mjs @@ -0,0 +1,218 @@ +#!/usr/bin/env node + +// JS Asset Auditor CLI +// +// Standalone Playwright-based tool that sweeps a publisher page for third-party +// JS assets and generates js-assets.toml entries. Fully deterministic — no LLM +// involvement. +// +// Usage: +// node tools/js-asset-auditor/audit.mjs https://www.publisher.com [options] +// +// Options: +// --diff Compare against existing js-assets.toml +// --settle Settle window after page load (default: 6000) +// --first-party Additional first-party hosts (comma-separated) +// --no-filter Bypass heuristic filtering +// --headed Run browser visibly for debugging +// --output Output file path (default: js-assets.toml) +// +// Prerequisites: +// cd tools/js-asset-auditor && npm install && npx playwright install chromium + +import { readFileSync, writeFileSync } from "node:fs"; +import { resolve } from "node:path"; +import { processAssets } from "../../scripts/audit-js-assets.mjs"; + +// --------------------------------------------------------------------------- +// Config reading +// --------------------------------------------------------------------------- + +function readPublisherDomain(repoRoot) { + const content = readFileSync( + resolve(repoRoot, "trusted-server.toml"), + "utf-8", + ); + const lines = content.split("\n"); + let inPublisher = false; + for (const line of lines) { + if (/^\[publisher\]/.test(line)) { + inPublisher = true; + continue; + } + if (/^\[/.test(line)) { + inPublisher = false; + continue; + } + if (inPublisher) { + const m = line.match(/^domain\s*=\s*"([^"]+)"/); + if (m) return m[1]; + } + } + throw new Error( + "Could not find [publisher].domain in trusted-server.toml", + ); +} + +// --------------------------------------------------------------------------- +// CLI argument parsing +// --------------------------------------------------------------------------- + +function parseArgs(argv) { + const args = 
{ + url: null, + diff: false, + settle: 6000, + firstParty: [], + noFilter: false, + headed: false, + output: "js-assets.toml", + }; + + for (let i = 2; i < argv.length; i++) { + const arg = argv[i]; + if (arg === "--diff") { + args.diff = true; + } else if (arg === "--settle") { + args.settle = parseInt(argv[++i], 10); + } else if (arg === "--first-party") { + args.firstParty = argv[++i].split(",").filter(Boolean); + } else if (arg === "--no-filter") { + args.noFilter = true; + } else if (arg === "--headed") { + args.headed = true; + } else if (arg === "--output") { + args.output = argv[++i]; + } else if (!arg.startsWith("--") && !args.url) { + // Positional argument: the URL + args.url = arg.startsWith("http") ? arg : `https://${arg}`; + } else { + console.error(`Unknown argument: ${arg}`); + process.exit(1); + } + } + + if (!args.url) { + console.error( + "Usage: node tools/js-asset-auditor/audit.mjs [--diff] [--settle ] [--first-party ] [--no-filter] [--headed] [--output ]", + ); + process.exit(1); + } + + return args; +} + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- + +async function main() { + const args = parseArgs(process.argv); + const repoRoot = process.cwd(); + + // Read publisher domain from config + let domain; + try { + domain = readPublisherDomain(repoRoot); + } catch (err) { + console.error(err.message); + process.exit(1); + } + + // Import Playwright + let chromium; + try { + ({ chromium } = await import("playwright")); + } catch { + console.error( + "Playwright not installed. Run:\n cd tools/js-asset-auditor && npm install", + ); + process.exit(1); + } + + // Launch browser + console.error(`Launching browser...`); + let browser; + try { + browser = await chromium.launch({ headless: !args.headed }); + } catch (err) { + if (err.message.includes("Executable doesn't exist")) { + console.error( + "Chromium not installed. 
Run:\n cd tools/js-asset-auditor && npx playwright install chromium", + ); + process.exit(1); + } + throw err; + } + + try { + const context = await browser.newContext(); + const page = await context.newPage(); + + // Collect script network requests + const scriptUrls = []; + page.on("response", (response) => { + const req = response.request(); + if (req.resourceType() === "script") { + scriptUrls.push(req.url()); + } + }); + + // Navigate + console.error(`Navigating to ${args.url}...`); + await page.goto(args.url, { waitUntil: "load", timeout: 30000 }); + + // Settle + console.error(`Waiting ${args.settle}ms for page to settle...`); + await page.waitForTimeout(args.settle); + + // Collect head scripts from DOM + const headScriptUrls = await page.evaluate(() => + Array.from( + document.head.querySelectorAll("script[src]"), + ).map((s) => s.src), + ); + + console.error( + `Found ${scriptUrls.length} network scripts, ${headScriptUrls.length} head scripts`, + ); + + await browser.close(); + + // Process + console.error("Processing assets..."); + const result = processAssets( + { networkUrls: scriptUrls, headUrls: headScriptUrls }, + { + domain, + target: args.url, + output: args.output, + diff: args.diff, + firstParty: args.firstParty, + noFilter: args.noFilter, + }, + ); + + if (result.error) { + console.error(result.error); + process.exit(1); + } + + // Write output + writeFileSync(args.output, result.toml); + const count = + result.summary.mode === "init" + ? 
result.summary.surfaced + : result.summary.new.length; + console.error(`Wrote ${args.output} (${count} entries)`); + + // Print JSON summary to stdout + console.log(JSON.stringify(result.summary)); + } finally { + if (browser.isConnected()) { + await browser.close(); + } + } +} + +main(); diff --git a/tools/js-asset-auditor/package.json b/tools/js-asset-auditor/package.json new file mode 100644 index 00000000..d5349ab8 --- /dev/null +++ b/tools/js-asset-auditor/package.json @@ -0,0 +1,9 @@ +{ + "name": "js-asset-auditor", + "version": "1.0.0", + "private": true, + "type": "module", + "dependencies": { + "playwright": "^1.58.0" + } +} From 7a695226b312390c15a5a84680c307ea76e4eb8e Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 13 Apr 2026 12:00:00 -0500 Subject: [PATCH 06/12] Update JS Asset Auditor spec to reflect Playwright CLI architecture Rewrite sweep protocol, implementation, and verification sections to describe the three-component architecture: Playwright CLI, processing library, and Claude Code skill wrapper. Add direct CLI invocation examples, --headed flag, first-party auto-detection verification, and ad-rendering filter verification steps. --- .../2026-04-01-js-asset-auditor-design.md | 74 ++++++++++--------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md index 6bbb567f..28a1f638 100644 --- a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md +++ b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md @@ -10,40 +10,47 @@ The JS Asset Proxy requires a `js-assets.toml` file declaring which third-party JS assets to proxy. Without tooling, populating this file requires manually inspecting network requests in browser DevTools, extracting URLs, generating opaque slugs, and writing TOML — a tedious error-prone process that is a barrier to publisher onboarding. -The Auditor eliminates this friction. 
It sweeps a publisher's page using the Chrome DevTools MCP, detects third-party JS assets, auto-generates `js-assets.toml` entries, and auto-detects `inject_in_head` from the page DOM. The operator's only remaining decision is reviewing the output before committing. +The Auditor eliminates this friction. It sweeps a publisher's page using Playwright (headless Chromium), detects third-party JS assets, auto-generates `js-assets.toml` entries, and auto-detects `inject_in_head` from the page DOM. The operator's only remaining decision is reviewing the output before committing. It also runs as a monitoring tool — `--diff` mode compares a new sweep against the existing config and surfaces new or removed assets, giving publishers ongoing visibility into their third-party JS footprint. -**Implementation:** Pure Claude Code skill — no Rust, no compiled code, no additional dependencies. Uses the Chrome DevTools MCP already configured in `.claude/settings.json`. +**Implementation:** Standalone Playwright CLI (`tools/js-asset-auditor/audit.mjs`) backed by a shared processing library (`scripts/audit-js-assets.mjs`). No Rust, no compiled code. The Claude Code skill (`/audit-js-assets`) is a thin wrapper that invokes the CLI. 
--- ## Command Interface ```bash +# Via Claude Code skill /audit-js-assets https://www.publisher.com # init — generate js-assets.toml /audit-js-assets https://www.publisher.com --diff # diff — compare against existing file /audit-js-assets https://www.publisher.com --settle 15000 # longer settle for ad-tech-heavy pages /audit-js-assets https://www.publisher.com --no-filter # bypass heuristic filtering +/audit-js-assets https://www.publisher.com --headed # visible browser for debugging + +# Direct CLI invocation (no Claude Code required) +node tools/js-asset-auditor/audit.mjs https://www.publisher.com +node tools/js-asset-auditor/audit.mjs https://www.publisher.com --diff --output js-assets.toml ``` --- ## Sweep Protocol +The CLI (`tools/js-asset-auditor/audit.mjs`) performs the full sweep: + 1. Read `trusted-server.toml` → extract `publisher.domain` (defines first-party boundary) -2. Open Chrome via `mcp__plugin_chrome-devtools-mcp_chrome-devtools__new_page`, navigate to target URL via `mcp__plugin_chrome-devtools-mcp_chrome-devtools__navigate_page` -3. Wait for page load settle: `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` with `await new Promise(r => setTimeout(r, SETTLE_MS))` where `SETTLE_MS` defaults to 6000 (configurable via `--settle `) -4. In parallel: - - `mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests` with `resourceTypes: ["script"]` → post-filter to exclude first-party hosts (see URL Processing below) - - `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` → `Array.from(document.head.querySelectorAll('script[src]')).map(s => s.src)` → collect head-loaded script URLs -5. Apply URL normalization (see below), then heuristic filter (see below) -6. For each surviving asset, generate a `[[js_assets]]` entry (see below) -7. Write output (init or diff mode) -8. Print terminal summary -9. 
Close page via `mcp__plugin_chrome-devtools-mcp_chrome-devtools__close_page` - -**`inject_in_head` semantics:** The DOM snapshot in step 4 captures the final state of `<head>` after the settle window. Scripts that were briefly inserted and then removed by a loader will not appear. This is intentional — `inject_in_head = true` means "the script is present in `<head>` at page-stable state." If a loader removes it before the snapshot, the proxy should not re-inject it. +2. Launch headless Chromium via Playwright (visible with `--headed`) +3. Register a response listener for `resourceType() === 'script'` to capture all script network requests +4. Navigate to target URL (`page.goto`, 30s timeout, follows redirects transparently) +5. Wait for page load settle: `page.waitForTimeout(SETTLE_MS)` where `SETTLE_MS` defaults to 6000 (configurable via `--settle <ms>`) +6. Evaluate `document.head.querySelectorAll('script[src]')` to collect head-loaded script URLs +7. Close browser +8. Pass collected URLs to `processAssets()` from `scripts/audit-js-assets.mjs` — applies URL normalization, first-party filtering, heuristic filtering, wildcard detection, slug generation +9. Write `js-assets.toml` output (init or diff mode) +10. Print JSON summary to stdout (progress lines go to stderr) + +**`inject_in_head` semantics:** The DOM snapshot in step 6 captures the final state of `<head>` after the settle window. Scripts that were briefly inserted and then removed by a loader will not appear. This is intentional — `inject_in_head = true` means "the script is present in `<head>` at page-stable state." If a loader removes it before the snapshot, the proxy should not re-inject it. --- @@ -124,7 +131,7 @@ The pipe (`|`) separator is required — it cannot appear in domain names or at **Rationale:** Fully opaque and hash-derived — no human naming required, no ambiguity for cryptic vendor filenames. The KV metadata (`origin_url`, `content_type`, `asset_slug`) serves as the lookup table. 
Operators can query `js-asset:{slug}` in the KV store to retrieve full provenance. The terminal summary also prints slug → origin_url at generation time. -**Important:** This algorithm must produce identical output to the Proxy's KV key derivation. Engineering should implement this as a shared utility (e.g., a small JS/TS helper in the skill, or a standalone `scripts/` utility) rather than duplicating the logic. +**Important:** This algorithm must produce identical output to the Proxy's KV key derivation. The reference implementation lives in `scripts/js-asset-slug.mjs` (standalone CLI) and is duplicated in `scripts/audit-js-assets.mjs` (processing library). Any changes must be synchronized across both files and the Rust proxy. ### Wildcard detection @@ -223,30 +230,28 @@ Missing: 1 asset no longer seen on page ⚠ ## Implementation -The Auditor has two components: +The Auditor has three components: -1. **Skill file** (`.claude/commands/audit-js-assets.md`) — Drives the browser via Chrome DevTools MCP, collects raw script URLs, and calls the processing script. ~100 lines. -2. **Processing script** (`scripts/audit-js-assets.mjs`) — Pure Node.js script (no external dependencies) that performs URL normalization, first-party filtering, heuristic filtering, wildcard detection, slug generation, and TOML formatting. Takes raw data on stdin (JSON with `networkUrls` and `headUrls`), writes TOML to a file, and prints a JSON summary to stdout. +1. **Playwright CLI** (`tools/js-asset-auditor/audit.mjs`) — Standalone Node.js script that launches headless Chromium via Playwright, navigates to the target URL, collects script network requests and head script DOM state, then calls the processing library. Outputs TOML file + JSON summary. Can be run directly without Claude Code. +2. **Processing library** (`scripts/audit-js-assets.mjs`) — Pure Node.js module (no external dependencies) that exports `processAssets()` and individual utility functions. 
Handles URL normalization, first-party filtering, heuristic filtering, wildcard detection, slug generation, and TOML formatting. Also usable as a standalone stdin-based CLI for integration with other tools. +3. **Claude Code skill** (`.claude/commands/audit-js-assets.md`) — Thin wrapper (~60 lines) that invokes the Playwright CLI via Bash and formats the JSON summary as a terminal report. No browser automation logic. -This split ensures deterministic, testable processing (the script) while keeping browser automation in the LLM's domain (the skill). +This architecture ensures fully deterministic, testable execution (the CLI) with an optional LLM-powered interface (the skill). -**MCP tools used:** +**Dependencies:** -- `mcp__plugin_chrome-devtools-mcp_chrome-devtools__new_page` — open browser tab -- `mcp__plugin_chrome-devtools-mcp_chrome-devtools__navigate_page` — load publisher URL -- `mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests` — capture JS requests -- `mcp__plugin_chrome-devtools-mcp_chrome-devtools__evaluate_script` — settle window + detect head-loaded scripts via DOM query -- `mcp__plugin_chrome-devtools-mcp_chrome-devtools__close_page` — clean up tab +- `playwright` — installed in `tools/js-asset-auditor/` (`npm install`). Chromium binaries are cached in `~/Library/Caches/ms-playwright/` and shared with `crates/integration-tests/browser/`. 
**Standalone utilities:** -- `scripts/js-asset-slug.mjs` — Standalone slug generator for individual URLs -- `scripts/audit-js-assets.mjs` — Full audit processing pipeline +- `scripts/js-asset-slug.mjs` — Standalone slug generator for individual URLs (stdlib-only, no external dependencies) +- `scripts/audit-js-assets.mjs` — Processing library + stdin-based CLI (stdlib-only, no external dependencies) -**File tools used:** +**Setup (one-time):** -- `Read` — read `trusted-server.toml` (publisher domain) -- `Write` — write input JSON for processing script +```bash +cd tools/js-asset-auditor && npm install && npx playwright install chromium +``` --- @@ -260,12 +265,15 @@ See [delivery order in the Proxy spec](2026-04-01-js-asset-proxy-design.md) _(on ## Verification -- Run `/audit-js-assets https://www.publisher.com` against a known test publisher page with identified third-party JS +- Run `node tools/js-asset-auditor/audit.mjs https://www.publisher.com` against a known test publisher page - Verify generated entries match actual third-party JS observed on the page (cross-check in browser DevTools) - Verify `inject_in_head = true` only for scripts that appear in `<head>` (not `<body>`) -- Verify wildcard detection fires for versioned path segments and not for stable paths +- Verify wildcard detection fires for versioned path segments (e.g., `1.19.13-0fnlww`) and not for stable paths - Verify GTM (`googletagmanager.com`) is captured and not filtered -- Verify framework CDNs (`cdnjs.cloudflare.com` etc.) are filtered with reason in summary +- Verify Google ad rendering infra (`pagead2.googlesyndication.com`, `s0.2mdn.net` etc.) 
is filtered with reason in summary +- Verify `securepubads.g.doubleclick.net` (GPT) is **not** filtered +- Verify first-party auto-detection: auditing `golf.com` with `publisher.domain = "test-publisher.com"` excludes `golf.com` scripts - Run `--diff` against an unchanged page → all entries confirmed, no new/missing - Run `--diff` after adding a new vendor script to the page → appears as `NEW` in summary - Run `--diff` after removing a script → appears as `MISSING ⚠` in summary, file unchanged +- Run `/audit-js-assets <url>` via Claude Code skill → identical results to direct CLI invocation From f50a1988ca71065dabf353ae6b4d92e58c48747a Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 13 Apr 2026 12:51:12 -0500 Subject: [PATCH 07/12] Move JS Asset Auditor into Claude Code plugin structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restructure into packages/js-asset-auditor/ as a self-contained Claude Code plugin with .claude-plugin/plugin.json manifest, skills/ directory, bin/ executable, and lib/ processing modules. The plugin provides the audit-js-assets skill and CLI automatically when enabled. Remove tools/js-asset-auditor/, scripts/audit-js-assets.mjs, and .claude/commands/audit-js-assets.md — all replaced by the plugin. 
--- .claude/settings.json | 2 - .gitignore | 4 +- .../.claude-plugin/plugin.json | 10 ++ packages/js-asset-auditor/bin/audit-js-assets | 11 ++ .../js-asset-auditor/lib}/audit.mjs | 38 +++--- .../js-asset-auditor/lib/process.mjs | 111 +----------------- packages/js-asset-auditor/lib/slug.mjs | 27 +++++ packages/js-asset-auditor/package-lock.json | 62 ++++++++++ .../js-asset-auditor/package.json | 3 + packages/js-asset-auditor/settings.json | 7 ++ .../skills/audit-js-assets/SKILL.md | 13 +- 11 files changed, 153 insertions(+), 135 deletions(-) create mode 100644 packages/js-asset-auditor/.claude-plugin/plugin.json create mode 100755 packages/js-asset-auditor/bin/audit-js-assets rename {tools/js-asset-auditor => packages/js-asset-auditor/lib}/audit.mjs (83%) rename scripts/audit-js-assets.mjs => packages/js-asset-auditor/lib/process.mjs (79%) create mode 100644 packages/js-asset-auditor/lib/slug.mjs create mode 100644 packages/js-asset-auditor/package-lock.json rename {tools => packages}/js-asset-auditor/package.json (70%) create mode 100644 packages/js-asset-auditor/settings.json rename .claude/commands/audit-js-assets.md => packages/js-asset-auditor/skills/audit-js-assets/SKILL.md (80%) diff --git a/.claude/settings.json b/.claude/settings.json index 8c0f3512..8a63480b 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -24,8 +24,6 @@ "Bash(git log:*)", "Bash(git status:*)", "Bash(node scripts/js-asset-slug.mjs:*)", - "Bash(node scripts/audit-js-assets.mjs:*)", - "Bash(node tools/js-asset-auditor/audit.mjs:*)", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__new_page", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__navigate_page", "mcp__plugin_chrome-devtools-mcp_chrome-devtools__list_network_requests", diff --git a/.gitignore b/.gitignore index 282288cd..f9bc0ef2 100644 --- a/.gitignore +++ b/.gitignore @@ -31,8 +31,8 @@ src/*.html /guest-profiles /benchmark-results/** -# JS Asset Auditor tool -/tools/js-asset-auditor/node_modules/ +# JS Asset 
Auditor plugin +/packages/js-asset-auditor/node_modules/ # Playwright browser tests /crates/integration-tests/browser/node_modules/ diff --git a/packages/js-asset-auditor/.claude-plugin/plugin.json b/packages/js-asset-auditor/.claude-plugin/plugin.json new file mode 100644 index 00000000..b6be8d85 --- /dev/null +++ b/packages/js-asset-auditor/.claude-plugin/plugin.json @@ -0,0 +1,10 @@ +{ + "name": "js-asset-auditor", + "version": "1.0.0", + "description": "Audit publisher pages for third-party JS assets and generate js-assets.toml entries using Playwright", + "author": { + "name": "StackPop" + }, + "license": "MIT", + "keywords": ["js-assets", "audit", "playwright", "ad-tech", "proxy"] +} diff --git a/packages/js-asset-auditor/bin/audit-js-assets b/packages/js-asset-auditor/bin/audit-js-assets new file mode 100755 index 00000000..cdff67ef --- /dev/null +++ b/packages/js-asset-auditor/bin/audit-js-assets @@ -0,0 +1,11 @@ +#!/usr/bin/env node + +// Plugin bin/ wrapper — resolves lib/audit.mjs relative to plugin root, +// not the user's working directory. + +import { fileURLToPath } from "node:url"; +import { dirname, resolve } from "node:path"; + +const pluginRoot = resolve(dirname(fileURLToPath(import.meta.url)), ".."); +const { main } = await import(resolve(pluginRoot, "lib/audit.mjs")); +main(); diff --git a/tools/js-asset-auditor/audit.mjs b/packages/js-asset-auditor/lib/audit.mjs similarity index 83% rename from tools/js-asset-auditor/audit.mjs rename to packages/js-asset-auditor/lib/audit.mjs index b36d6a6b..a10721d8 100644 --- a/tools/js-asset-auditor/audit.mjs +++ b/packages/js-asset-auditor/lib/audit.mjs @@ -3,11 +3,11 @@ // JS Asset Auditor CLI // // Standalone Playwright-based tool that sweeps a publisher page for third-party -// JS assets and generates js-assets.toml entries. Fully deterministic — no LLM -// involvement. +// JS assets and generates js-assets.toml entries. 
// // Usage: -// node tools/js-asset-auditor/audit.mjs https://www.publisher.com [options] +// node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com [options] +// audit-js-assets https://www.publisher.com [options] (when plugin bin/ is in PATH) // // Options: // --diff Compare against existing js-assets.toml @@ -18,11 +18,11 @@ // --output Output file path (default: js-assets.toml) // // Prerequisites: -// cd tools/js-asset-auditor && npm install && npx playwright install chromium +// cd packages/js-asset-auditor && npm install && npx playwright install chromium import { readFileSync, writeFileSync } from "node:fs"; import { resolve } from "node:path"; -import { processAssets } from "../../scripts/audit-js-assets.mjs"; +import { processAssets } from "./process.mjs"; // --------------------------------------------------------------------------- // Config reading @@ -84,7 +84,6 @@ function parseArgs(argv) { } else if (arg === "--output") { args.output = argv[++i]; } else if (!arg.startsWith("--") && !args.url) { - // Positional argument: the URL args.url = arg.startsWith("http") ? 
arg : `https://${arg}`; } else { console.error(`Unknown argument: ${arg}`); @@ -94,7 +93,7 @@ function parseArgs(argv) { if (!args.url) { console.error( - "Usage: node tools/js-asset-auditor/audit.mjs [--diff] [--settle ] [--first-party ] [--no-filter] [--headed] [--output ]", + "Usage: audit-js-assets [--diff] [--settle ] [--first-party ] [--no-filter] [--headed] [--output ]", ); process.exit(1); } @@ -106,11 +105,10 @@ function parseArgs(argv) { // Main // --------------------------------------------------------------------------- -async function main() { +export async function main() { const args = parseArgs(process.argv); const repoRoot = process.cwd(); - // Read publisher domain from config let domain; try { domain = readPublisherDomain(repoRoot); @@ -119,18 +117,16 @@ async function main() { process.exit(1); } - // Import Playwright let chromium; try { ({ chromium } = await import("playwright")); } catch { console.error( - "Playwright not installed. Run:\n cd tools/js-asset-auditor && npm install", + "Playwright not installed. Run:\n cd packages/js-asset-auditor && npm install", ); process.exit(1); } - // Launch browser console.error(`Launching browser...`); let browser; try { @@ -138,7 +134,7 @@ async function main() { } catch (err) { if (err.message.includes("Executable doesn't exist")) { console.error( - "Chromium not installed. Run:\n cd tools/js-asset-auditor && npx playwright install chromium", + "Chromium not installed. 
Run:\n cd packages/js-asset-auditor && npx playwright install chromium", ); process.exit(1); } @@ -149,7 +145,6 @@ async function main() { const context = await browser.newContext(); const page = await context.newPage(); - // Collect script network requests const scriptUrls = []; page.on("response", (response) => { const req = response.request(); @@ -158,15 +153,12 @@ async function main() { } }); - // Navigate console.error(`Navigating to ${args.url}...`); await page.goto(args.url, { waitUntil: "load", timeout: 30000 }); - // Settle console.error(`Waiting ${args.settle}ms for page to settle...`); await page.waitForTimeout(args.settle); - // Collect head scripts from DOM const headScriptUrls = await page.evaluate(() => Array.from( document.head.querySelectorAll("script[src]"), @@ -179,7 +171,6 @@ async function main() { await browser.close(); - // Process console.error("Processing assets..."); const result = processAssets( { networkUrls: scriptUrls, headUrls: headScriptUrls }, @@ -198,7 +189,6 @@ async function main() { process.exit(1); } - // Write output writeFileSync(args.output, result.toml); const count = result.summary.mode === "init" @@ -206,7 +196,6 @@ async function main() { : result.summary.new.length; console.error(`Wrote ${args.output} (${count} entries)`); - // Print JSON summary to stdout console.log(JSON.stringify(result.summary)); } finally { if (browser.isConnected()) { @@ -215,4 +204,11 @@ async function main() { } } -main(); +// Run when invoked directly +const isDirectExecution = + process.argv[1] && + new URL(process.argv[1], "file://").href === import.meta.url; + +if (isDirectExecution) { + main(); +} diff --git a/scripts/audit-js-assets.mjs b/packages/js-asset-auditor/lib/process.mjs similarity index 79% rename from scripts/audit-js-assets.mjs rename to packages/js-asset-auditor/lib/process.mjs index 8bc7690c..623926fc 100644 --- a/scripts/audit-js-assets.mjs +++ b/packages/js-asset-auditor/lib/process.mjs @@ -1,26 +1,14 @@ -#!/usr/bin/env 
node - -// JS Asset Auditor — Processing Library & CLI -// -// Provides URL processing functions (normalization, filtering, wildcard -// detection, slug generation, TOML formatting) used by both the standalone -// Playwright CLI (tools/js-asset-auditor/audit.mjs) and the stdin-based -// pipeline invoked by the Claude Code skill. +// JS Asset Auditor — Processing Library // -// CLI usage (stdin mode): -// cat input.json | node scripts/audit-js-assets.mjs \ -// --domain --target \ -// [--output js-assets.toml] [--diff] [--first-party ] [--no-filter] -// -// Stdin format: -// {"networkUrls": ["https://..."], "headUrls": ["https://..."]} +// Pure processing functions for URL normalization, filtering, wildcard +// detection, slug generation, and TOML formatting. No external dependencies. // // The slug algorithm is duplicated from scripts/js-asset-slug.mjs. Both files // must produce identical output. Any changes must be synchronized. import { createHash } from "node:crypto"; import { posix } from "node:path"; -import { readFileSync, writeFileSync } from "node:fs"; +import { readFileSync } from "node:fs"; // --------------------------------------------------------------------------- // Constants @@ -29,8 +17,6 @@ import { readFileSync, writeFileSync } from "node:fs"; const BASE62_CHARSET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; -// Heuristic filter: host-only entries use dot-boundary suffix matching. -// Entries with a `pathPrefix` also require the URL path to start with it. 
export const HEURISTIC_FILTERS = { "Framework CDNs": ["cdnjs.cloudflare.com", "ajax.googleapis.com", "cdn.jsdelivr.net", "unpkg.com"], "Error tracking": ["sentry.io", "bugsnag.com", "rollbar.com"], @@ -57,7 +43,7 @@ const HEX_HASH_RE = /^[a-f0-9]{8,}$/; const MIXED_HASH_RE = /^[A-Za-z0-9]{8,}$/; // --------------------------------------------------------------------------- -// Slug generation (duplicated from scripts/js-asset-slug.mjs) +// Slug generation // --------------------------------------------------------------------------- function bufferToBase62(buffer) { @@ -256,11 +242,9 @@ export function processAssets(input, args) { targetHost = args.target; } - // Normalize and deduplicate const normalizedNetwork = [...new Set(rawNetworkUrls.map(normalizeUrl))]; const normalizedHead = new Set(rawHeadUrls.map(normalizeUrl)); - // First-party filter const firstPartyFiltered = []; const thirdPartyUrls = []; @@ -278,7 +262,6 @@ export function processAssets(input, args) { } } - // Heuristic filter const heuristicFiltered = []; const survivingUrls = []; @@ -311,7 +294,6 @@ export function processAssets(input, args) { filterCounts[f.host] = (filterCounts[f.host] || 0) + 1; } - // Process surviving URLs const assets = []; const seenOrigins = new Set(); @@ -353,7 +335,6 @@ export function processAssets(input, args) { }); } - // Generate TOML and summary const today = new Date().toISOString().slice(0, 10); if (args.diff) { @@ -395,7 +376,6 @@ export function processAssets(input, args) { }; } - // Init mode let toml = `# Generated by /audit-js-assets on ${today}\n`; toml += `# Publisher: ${args.domain}\n`; toml += `# Source URL: ${args.target}\n`; @@ -429,84 +409,3 @@ export function processAssets(input, args) { }, }; } - -// --------------------------------------------------------------------------- -// CLI argument parsing (for stdin mode) -// --------------------------------------------------------------------------- - -function parseArgs(argv) { - const args = { - domain: 
null, - target: null, - output: "js-assets.toml", - diff: false, - firstParty: [], - noFilter: false, - }; - - for (let i = 2; i < argv.length; i++) { - switch (argv[i]) { - case "--domain": - args.domain = argv[++i]; - break; - case "--target": - args.target = argv[++i]; - break; - case "--output": - args.output = argv[++i]; - break; - case "--diff": - args.diff = true; - break; - case "--first-party": - args.firstParty = argv[++i].split(",").filter(Boolean); - break; - case "--no-filter": - args.noFilter = true; - break; - default: - console.error(`Unknown argument: ${argv[i]}`); - process.exit(1); - } - } - - if (!args.domain || !args.target) { - console.error( - "Usage: cat input.json | node scripts/audit-js-assets.mjs --domain --target [--output file] [--diff] [--first-party hosts] [--no-filter]", - ); - process.exit(1); - } - - return args; -} - -// --------------------------------------------------------------------------- -// CLI entry point (stdin mode) — only runs when invoked directly -// --------------------------------------------------------------------------- - -async function main() { - const args = parseArgs(process.argv); - - const chunks = []; - for await (const chunk of process.stdin) chunks.push(chunk); - const input = JSON.parse(Buffer.concat(chunks).toString()); - - const result = processAssets(input, args); - - if (result.error) { - console.error(result.error); - process.exit(1); - } - - writeFileSync(args.output, result.toml); - console.log(JSON.stringify(result.summary)); -} - -// Only run main() when this file is executed directly, not when imported -const isDirectExecution = - process.argv[1] && - new URL(process.argv[1], "file://").href === import.meta.url; - -if (isDirectExecution) { - main(); -} diff --git a/packages/js-asset-auditor/lib/slug.mjs b/packages/js-asset-auditor/lib/slug.mjs new file mode 100644 index 00000000..fcc0c7c0 --- /dev/null +++ b/packages/js-asset-auditor/lib/slug.mjs @@ -0,0 +1,27 @@ +#!/usr/bin/env node + +// 
JS Asset Slug Generator +// +// Shared utility for generating deterministic slugs for js-assets.toml entries. +// Must produce identical output to the Rust proxy's KV key derivation. +// +// Algorithm: +// publisher_prefix = first_8_chars(base62(sha256(domain + "|" + url))) +// asset_stem = filename_without_extension(url) +// slug = "{publisher_prefix}:{asset_stem}" +// +// Usage: +// node packages/js-asset-auditor/lib/slug.mjs + +import { generateSlug } from "./process.mjs"; + +const [publisherDomain, originUrl] = process.argv.slice(2); + +if (!publisherDomain || !originUrl) { + console.error( + "Usage: node packages/js-asset-auditor/lib/slug.mjs ", + ); + process.exit(1); +} + +console.log(generateSlug(publisherDomain, originUrl)); diff --git a/packages/js-asset-auditor/package-lock.json b/packages/js-asset-auditor/package-lock.json new file mode 100644 index 00000000..080a7ad9 --- /dev/null +++ b/packages/js-asset-auditor/package-lock.json @@ -0,0 +1,62 @@ +{ + "name": "js-asset-auditor", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "js-asset-auditor", + "version": "1.0.0", + "dependencies": { + "playwright": "^1.58.0" + }, + "bin": { + "audit-js-assets": "bin/audit-js-assets" + } + }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/playwright": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz", + "integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==", + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.59.1" + }, + 
"bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz", + "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + } + } +} diff --git a/tools/js-asset-auditor/package.json b/packages/js-asset-auditor/package.json similarity index 70% rename from tools/js-asset-auditor/package.json rename to packages/js-asset-auditor/package.json index d5349ab8..044f4214 100644 --- a/tools/js-asset-auditor/package.json +++ b/packages/js-asset-auditor/package.json @@ -3,6 +3,9 @@ "version": "1.0.0", "private": true, "type": "module", + "bin": { + "audit-js-assets": "./bin/audit-js-assets" + }, "dependencies": { "playwright": "^1.58.0" } diff --git a/packages/js-asset-auditor/settings.json b/packages/js-asset-auditor/settings.json new file mode 100644 index 00000000..c17ba006 --- /dev/null +++ b/packages/js-asset-auditor/settings.json @@ -0,0 +1,7 @@ +{ + "permissions": { + "allow": [ + "Bash(audit-js-assets:*)" + ] + } +} diff --git a/.claude/commands/audit-js-assets.md b/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md similarity index 80% rename from .claude/commands/audit-js-assets.md rename to packages/js-asset-auditor/skills/audit-js-assets/SKILL.md index d088c7c0..fe9f190f 100644 --- a/.claude/commands/audit-js-assets.md +++ b/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md @@ -1,6 +1,11 @@ +--- +name: audit-js-assets +description: Audit a publisher page for third-party JavaScript assets. Use when analyzing external scripts, generating js-assets.toml entries, or monitoring changes to a publisher's JS footprint. 
+--- + Audit a publisher page for third-party JS assets and generate `js-assets.toml` entries. -Usage: /audit-js-assets $ARGUMENTS +Usage: /js-asset-auditor:audit-js-assets $ARGUMENTS `$ARGUMENTS`: ` [--diff] [--settle ] [--first-party ,...] [--no-filter] [--headed]` @@ -13,7 +18,7 @@ Follow these steps exactly. Stop and report if any step fails. Run the Playwright CLI via Bash, forwarding all arguments from `$ARGUMENTS`: ```bash -node tools/js-asset-auditor/audit.mjs $ARGUMENTS +audit-js-assets $ARGUMENTS ``` The CLI reads `trusted-server.toml` for the publisher domain, opens a headless browser, collects script URLs, processes them, and writes `js-assets.toml`. Progress lines appear on stderr; a JSON summary prints to stdout. @@ -21,7 +26,7 @@ The CLI reads `trusted-server.toml` for the publisher domain, opens a headless b If the command fails with "Playwright not installed" or "Chromium not installed", tell the user to run: ```bash -cd tools/js-asset-auditor && npm install && npx playwright install chromium +cd packages/js-asset-auditor && npm install && npx playwright install chromium ``` ## 2. Show results @@ -42,7 +47,7 @@ Surfaced: {surfaced} assets → js-assets.toml ... Review inject_in_head values and commit js-assets.toml when ready. -Diff mode: /audit-js-assets --diff +Diff mode: /js-asset-auditor:audit-js-assets --diff ``` ### Diff mode From 36535a56d53c55ecf1264299438d3243239604ca Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 13 Apr 2026 12:53:59 -0500 Subject: [PATCH 08/12] Add plugin marketplace index for js-asset-auditor Enables installing the JS Asset Auditor plugin from this repo via /plugin marketplace add /trusted-server followed by /plugin install js-asset-auditor. 
--- .claude-plugin/marketplace.json | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .claude-plugin/marketplace.json diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json new file mode 100644 index 00000000..ddde0dbc --- /dev/null +++ b/.claude-plugin/marketplace.json @@ -0,0 +1,9 @@ +{ + "plugins": [ + { + "name": "js-asset-auditor", + "description": "Audit publisher pages for third-party JS assets and generate js-assets.toml entries using Playwright", + "path": "packages/js-asset-auditor" + } + ] +} From bbf3b1b02497f29f9432841cbd7de492b5a947fc Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 13 Apr 2026 13:03:42 -0500 Subject: [PATCH 09/12] Make publisher domain optional in JS Asset Auditor CLI Add --domain flag and fall back to inferring from the target URL when trusted-server.toml is not present. Enables using the plugin in any project without project-specific config. --- packages/js-asset-auditor/lib/audit.mjs | 26 ++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/packages/js-asset-auditor/lib/audit.mjs b/packages/js-asset-auditor/lib/audit.mjs index a10721d8..415f54d8 100644 --- a/packages/js-asset-auditor/lib/audit.mjs +++ b/packages/js-asset-auditor/lib/audit.mjs @@ -61,6 +61,7 @@ function readPublisherDomain(repoRoot) { function parseArgs(argv) { const args = { url: null, + domain: null, diff: false, settle: 6000, firstParty: [], @@ -71,7 +72,9 @@ function parseArgs(argv) { for (let i = 2; i < argv.length; i++) { const arg = argv[i]; - if (arg === "--diff") { + if (arg === "--domain") { + args.domain = argv[++i]; + } else if (arg === "--diff") { args.diff = true; } else if (arg === "--settle") { args.settle = parseInt(argv[++i], 10); @@ -109,12 +112,21 @@ export async function main() { const args = parseArgs(process.argv); const repoRoot = process.cwd(); - let domain; - try { - domain = readPublisherDomain(repoRoot); - } catch (err) { - console.error(err.message); - 
process.exit(1); + // Resolve publisher domain: --domain flag > trusted-server.toml > infer from URL + let domain = args.domain; + if (!domain) { + try { + domain = readPublisherDomain(repoRoot); + } catch { + // No config file — infer from target URL + try { + const host = new URL(args.url).hostname; + domain = host.startsWith("www.") ? host.slice(4) : host; + } catch { + domain = args.url; + } + console.error(`No trusted-server.toml found, using domain: ${domain}`); + } } let chromium; From 3473fa7c6ad80012c7d0f262bd40e46f2e3daf10 Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 13 Apr 2026 13:10:05 -0500 Subject: [PATCH 10/12] Update JS Asset Auditor spec for plugin structure and optional domain Reflect the plugin layout at packages/js-asset-auditor/, update all file paths, document the domain resolution fallback chain (--domain flag > trusted-server.toml > infer from URL), and update skill invocation to use the namespaced /js-asset-auditor:audit-js-assets format. --- .../2026-04-01-js-asset-auditor-design.md | 83 ++++++++++++------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md index 28a1f638..f0c2d59e 100644 --- a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md +++ b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md @@ -14,39 +14,40 @@ The Auditor eliminates this friction. It sweeps a publisher's page using Playwri It also runs as a monitoring tool — `--diff` mode compares a new sweep against the existing config and surfaces new or removed assets, giving publishers ongoing visibility into their third-party JS footprint. -**Implementation:** Standalone Playwright CLI (`tools/js-asset-auditor/audit.mjs`) backed by a shared processing library (`scripts/audit-js-assets.mjs`). No Rust, no compiled code. The Claude Code skill (`/audit-js-assets`) is a thin wrapper that invokes the CLI. 
+**Implementation:** Claude Code plugin at `packages/js-asset-auditor/` containing a standalone Playwright CLI, a processing library, and a skill definition. No Rust, no compiled code. Can also be run directly without Claude Code. --- ## Command Interface ```bash -# Via Claude Code skill -/audit-js-assets https://www.publisher.com # init — generate js-assets.toml -/audit-js-assets https://www.publisher.com --diff # diff — compare against existing file -/audit-js-assets https://www.publisher.com --settle 15000 # longer settle for ad-tech-heavy pages -/audit-js-assets https://www.publisher.com --no-filter # bypass heuristic filtering -/audit-js-assets https://www.publisher.com --headed # visible browser for debugging +# Via Claude Code plugin skill +/js-asset-auditor:audit-js-assets https://www.publisher.com # init — generate js-assets.toml +/js-asset-auditor:audit-js-assets https://www.publisher.com --diff # diff — compare against existing file +/js-asset-auditor:audit-js-assets https://www.publisher.com --settle 15000 # longer settle for ad-tech-heavy pages +/js-asset-auditor:audit-js-assets https://www.publisher.com --no-filter # bypass heuristic filtering +/js-asset-auditor:audit-js-assets https://www.publisher.com --headed # visible browser for debugging # Direct CLI invocation (no Claude Code required) -node tools/js-asset-auditor/audit.mjs https://www.publisher.com -node tools/js-asset-auditor/audit.mjs https://www.publisher.com --diff --output js-assets.toml +node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com +node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com --domain publisher.com +node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com --diff --output js-assets.toml ``` --- ## Sweep Protocol -The CLI (`tools/js-asset-auditor/audit.mjs`) performs the full sweep: +The CLI (`packages/js-asset-auditor/lib/audit.mjs`) performs the full sweep: -1. 
Read `trusted-server.toml` → extract `publisher.domain` (defines first-party boundary) +1. Resolve publisher domain: `--domain` flag → `trusted-server.toml` → infer from target URL 2. Launch headless Chromium via Playwright (visible with `--headed`) 3. Register a response listener for `resourceType() === 'script'` to capture all script network requests 4. Navigate to target URL (`page.goto`, 30s timeout, follows redirects transparently) 5. Wait for page load settle: `page.waitForTimeout(SETTLE_MS)` where `SETTLE_MS` defaults to 6000 (configurable via `--settle `) 6. Evaluate `document.head.querySelectorAll('script[src]')` to collect head-loaded script URLs 7. Close browser -8. Pass collected URLs to `processAssets()` from `scripts/audit-js-assets.mjs` — applies URL normalization, first-party filtering, heuristic filtering, wildcard detection, slug generation +8. Pass collected URLs to `processAssets()` from `lib/process.mjs` — applies URL normalization, first-party filtering, heuristic filtering, wildcard detection, slug generation 9. Write `js-assets.toml` output (init or diff mode) 10. Print JSON summary to stdout (progress lines go to stderr) @@ -58,11 +59,13 @@ The CLI (`tools/js-asset-auditor/audit.mjs`) performs the full sweep: ### First-party boundary -A network request is **first-party** if the request URL's host, after stripping a leading `www.`, matches `publisher.domain` (from `trusted-server.toml`) after the same stripping. Matching is exact on the resulting strings. +A network request is **first-party** if the request URL's host, after stripping a leading `www.`, matches the publisher domain after the same stripping. Matching is exact on the resulting strings. -Publisher-owned CDN subdomains (e.g., `cdn.publisher.com`, `static.publisher.com`) are treated as third-party by default. If the publisher wants to exclude them, they can be added to a `first_party_hosts` list in the command invocation (e.g., `--first-party cdn.publisher.com`). 
+**Domain resolution order:** `--domain ` flag → `publisher.domain` from `trusted-server.toml` → inferred from the target URL's hostname. This makes the tool usable in any project — `trusted-server.toml` is not required. -**Auto-detection:** The target URL's hostname is automatically included as first-party, in addition to `publisher.domain` from `trusted-server.toml`. This ensures that auditing `https://golf.com` when `publisher.domain = "test-publisher.com"` correctly excludes `golf.com` scripts without requiring `--first-party golf.com`. +**Auto-detection:** The target URL's hostname is automatically included as first-party, in addition to the resolved publisher domain. This ensures that auditing `https://golf.com` when `publisher.domain = "test-publisher.com"` correctly excludes `golf.com` scripts without requiring `--first-party golf.com`. + +Publisher-owned CDN subdomains (e.g., `cdn.publisher.com`, `static.publisher.com`) are treated as third-party by default. If the publisher wants to exclude them, they can be added via `--first-party cdn.publisher.com`. ### URL normalization @@ -131,7 +134,7 @@ The pipe (`|`) separator is required — it cannot appear in domain names or at **Rationale:** Fully opaque and hash-derived — no human naming required, no ambiguity for cryptic vendor filenames. The KV metadata (`origin_url`, `content_type`, `asset_slug`) serves as the lookup table. Operators can query `js-asset:{slug}` in the KV store to retrieve full provenance. The terminal summary also prints slug → origin_url at generation time. -**Important:** This algorithm must produce identical output to the Proxy's KV key derivation. The reference implementation lives in `scripts/js-asset-slug.mjs` (standalone CLI) and is duplicated in `scripts/audit-js-assets.mjs` (processing library). Any changes must be synchronized across both files and the Rust proxy. +**Important:** This algorithm must produce identical output to the Proxy's KV key derivation. 
The reference implementation lives in `packages/js-asset-auditor/lib/slug.mjs` (standalone CLI) and `packages/js-asset-auditor/lib/process.mjs` (processing library), with a copy in `scripts/js-asset-slug.mjs`. Any changes must be synchronized across all files and the Rust proxy. ### Wildcard detection @@ -230,29 +233,46 @@ Missing: 1 asset no longer seen on page ⚠ ## Implementation -The Auditor has three components: - -1. **Playwright CLI** (`tools/js-asset-auditor/audit.mjs`) — Standalone Node.js script that launches headless Chromium via Playwright, navigates to the target URL, collects script network requests and head script DOM state, then calls the processing library. Outputs TOML file + JSON summary. Can be run directly without Claude Code. -2. **Processing library** (`scripts/audit-js-assets.mjs`) — Pure Node.js module (no external dependencies) that exports `processAssets()` and individual utility functions. Handles URL normalization, first-party filtering, heuristic filtering, wildcard detection, slug generation, and TOML formatting. Also usable as a standalone stdin-based CLI for integration with other tools. -3. **Claude Code skill** (`.claude/commands/audit-js-assets.md`) — Thin wrapper (~60 lines) that invokes the Playwright CLI via Bash and formats the JSON summary as a terminal report. No browser automation logic. +The Auditor is packaged as a Claude Code plugin at `packages/js-asset-auditor/` with three components: -This architecture ensures fully deterministic, testable execution (the CLI) with an optional LLM-powered interface (the skill). 
+``` +packages/js-asset-auditor/ +├── .claude-plugin/plugin.json # Plugin manifest +├── skills/audit-js-assets/SKILL.md # Skill definition +├── bin/audit-js-assets # Executable (added to PATH by Claude Code) +├── lib/ +│ ├── audit.mjs # Playwright CLI — browser automation + orchestration +│ ├── process.mjs # Processing library — normalization, filtering, slugs, TOML +│ └── slug.mjs # Standalone slug generator +├── package.json # playwright dependency +└── settings.json # Auto-grants Bash(audit-js-assets:*) permission +``` -**Dependencies:** +1. **Playwright CLI** (`lib/audit.mjs`) — Launches headless Chromium, navigates to the target URL, collects script network requests and head script DOM state, then calls `processAssets()`. Outputs TOML file + JSON summary. Can be run directly without Claude Code. +2. **Processing library** (`lib/process.mjs`) — Pure Node.js module (no external dependencies) that exports `processAssets()` and individual utility functions. Handles URL normalization, first-party filtering, heuristic filtering, wildcard detection, slug generation, and TOML formatting. +3. **Claude Code skill** (`skills/audit-js-assets/SKILL.md`) — Thin wrapper that invokes the CLI via the `bin/audit-js-assets` executable and formats the JSON summary. -- `playwright` — installed in `tools/js-asset-auditor/` (`npm install`). Chromium binaries are cached in `~/Library/Caches/ms-playwright/` and shared with `crates/integration-tests/browser/`. 
+**Plugin installation:** -**Standalone utilities:** +```bash +# Local testing (loads for one session) +claude --plugin-dir packages/js-asset-auditor -- `scripts/js-asset-slug.mjs` — Standalone slug generator for individual URLs (stdlib-only, no external dependencies) -- `scripts/audit-js-assets.mjs` — Processing library + stdin-based CLI (stdlib-only, no external dependencies) +# Via marketplace (permanent installation) +/plugin marketplace add / +/plugin install js-asset-auditor +``` -**Setup (one-time):** +**Setup (one-time after install):** ```bash -cd tools/js-asset-auditor && npm install && npx playwright install chromium +cd packages/js-asset-auditor && npm install && npx playwright install chromium ``` +**Standalone utilities:** + +- `scripts/js-asset-slug.mjs` — Standalone slug generator for individual URLs (kept outside the plugin for backward compatibility) + --- ## Delivery Order @@ -265,7 +285,7 @@ See [delivery order in the Proxy spec](2026-04-01-js-asset-proxy-design.md) _(on ## Verification -- Run `node tools/js-asset-auditor/audit.mjs https://www.publisher.com` against a known test publisher page +- Run `node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com` against a known test publisher page - Verify generated entries match actual third-party JS observed on the page (cross-check in browser DevTools) - Verify `inject_in_head = true` only for scripts that appear in `` (not ``) - Verify wildcard detection fires for versioned path segments (e.g., `1.19.13-0fnlww`) and not for stable paths @@ -276,4 +296,5 @@ See [delivery order in the Proxy spec](2026-04-01-js-asset-proxy-design.md) _(on - Run `--diff` against an unchanged page → all entries confirmed, no new/missing - Run `--diff` after adding a new vendor script to the page → appears as `NEW` in summary - Run `--diff` after removing a script → appears as `MISSING ⚠` in summary, file unchanged -- Run `/audit-js-assets ` via Claude Code skill → identical results to direct CLI 
invocation +- Run `/js-asset-auditor:audit-js-assets ` via Claude Code plugin → identical results to direct CLI invocation +- Run CLI without `trusted-server.toml` (using `--domain` or domain inference) → works in any project From ba1a8c296a5292f6bf3671126153c7770db4399a Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 13 Apr 2026 13:29:19 -0500 Subject: [PATCH 11/12] Add integration detection and config generation to JS Asset Auditor New --config [path] flag auto-detects integrations (GPT, GTM, Didomi, DataDome, Lockr, Permutive, Prebid, APS) from swept script URLs and generates a trusted-server.toml with appropriate [integrations.*] sections. Auto-extracts fields like GTM container_id from query params and Permutive org/workspace IDs from URL paths. Fields needing manual input are marked with TODO comments. --- .../2026-04-01-js-asset-auditor-design.md | 66 +++++ packages/js-asset-auditor/lib/audit.mjs | 55 +++- packages/js-asset-auditor/lib/detect.mjs | 278 ++++++++++++++++++ .../skills/audit-js-assets/SKILL.md | 17 +- 4 files changed, 414 insertions(+), 2 deletions(-) create mode 100644 packages/js-asset-auditor/lib/detect.mjs diff --git a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md index f0c2d59e..348a00bc 100644 --- a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md +++ b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md @@ -27,11 +27,13 @@ It also runs as a monitoring tool — `--diff` mode compares a new sweep against /js-asset-auditor:audit-js-assets https://www.publisher.com --settle 15000 # longer settle for ad-tech-heavy pages /js-asset-auditor:audit-js-assets https://www.publisher.com --no-filter # bypass heuristic filtering /js-asset-auditor:audit-js-assets https://www.publisher.com --headed # visible browser for debugging +/js-asset-auditor:audit-js-assets https://www.publisher.com --config # also generate trusted-server.toml # Direct CLI 
invocation (no Claude Code required) node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com --domain publisher.com node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com --diff --output js-assets.toml +node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com --config my-config.toml ``` --- @@ -231,6 +233,64 @@ Missing: 1 asset no longer seen on page ⚠ --- +## Integration Detection & Config Generation + +When invoked with `--config [path]`, the CLI also detects known integrations from the swept URLs and generates a `trusted-server.toml` with appropriate `[integrations.*]` sections. + +### Detection patterns + +Integration detection runs on raw URLs (before normalization) to preserve query parameters needed for field extraction. + +| URL Pattern | Integration | Extracted Fields | +| -------------------------------------------------- | ---------------------- | ----------------------------------------- | +| `securepubads.g.doubleclick.net/tag/js/gpt*` | `gpt` | `script_url` | +| `www.googletagmanager.com/gtm.js?id=GTM-XXX` | `google_tag_manager` | `container_id` from `?id=` | +| `sdk.privacy-center.org` | `didomi` | (defaults) | +| `js.datadome.co` | `datadome` | (defaults) | +| `aim.loc.kr/*identity-lockr*.js` | `lockr` | `sdk_url` | +| `*.edge.permutive.app/*-web.js` | `permutive` | `organization_id`, `workspace_id` from URL | +| `*/prebid.js`, `*/prebidjs.js` (+ .min variants) | `prebid` | (detect only) | +| `c.amazon-adsystem.com/aax2/apstag*` | `aps` | (detect only) | + +### Field categories + +- **Full** — all config fields have defaults or are auto-extracted. Config section is ready to use. +- **Partial** — some fields auto-extracted, others need manual input (marked with `# TODO:`). +- **Detect only** — integration detected but key fields (e.g., `server_url`, `pub_id`) require manual input. 
+ +### Config output + +```toml +# Generated by js-asset-auditor on 2026-04-13 +# Source URL: https://www.publisher.com + +[publisher] +domain = "publisher.com" +# cookie_domain = ".publisher.com" +# origin_url = "https://origin.publisher.com" +# proxy_secret = "change-me" + +[integrations.gpt] +enabled = true +script_url = "https://securepubads.g.doubleclick.net/tag/js/gpt.js" # auto-detected +# cache_ttl_seconds = 3600 +# rewrite_script = true + +[integrations.google_tag_manager] +enabled = true +container_id = "GTM-TRCJMD6" # auto-detected + +[integrations.lockr] +enabled = true +sdk_url = "https://aim.loc.kr/identity-lockr-trust-server.js" # auto-detected +app_id = "" # TODO: set your Lockr Identity app_id +# api_endpoint = "https://identity.loc.kr" +``` + +If the target file already exists, the CLI errors unless `--force` is passed. + +--- + ## Implementation The Auditor is packaged as a Claude Code plugin at `packages/js-asset-auditor/` with three components: @@ -242,6 +302,7 @@ packages/js-asset-auditor/ ├── bin/audit-js-assets # Executable (added to PATH by Claude Code) ├── lib/ │ ├── audit.mjs # Playwright CLI — browser automation + orchestration +│ ├── detect.mjs # Integration detection engine + config generation │ ├── process.mjs # Processing library — normalization, filtering, slugs, TOML │ └── slug.mjs # Standalone slug generator ├── package.json # playwright dependency @@ -298,3 +359,8 @@ See [delivery order in the Proxy spec](2026-04-01-js-asset-proxy-design.md) _(on - Run `--diff` after removing a script → appears as `MISSING ⚠` in summary, file unchanged - Run `/js-asset-auditor:audit-js-assets ` via Claude Code plugin → identical results to direct CLI invocation - Run CLI without `trusted-server.toml` (using `--domain` or domain inference) → works in any project +- Run with `--config` → generates `trusted-server.toml` with detected integrations +- Verify GTM `container_id` is auto-extracted from `?id=GTM-XXXXX` query param +- Verify integrations 
with TODO fields are marked with `# TODO:` comments +- Verify `--config` without `--force` errors when target file exists +- Verify JSON summary includes `integrations` array when `--config` is used diff --git a/packages/js-asset-auditor/lib/audit.mjs b/packages/js-asset-auditor/lib/audit.mjs index 415f54d8..f5f5a29b 100644 --- a/packages/js-asset-auditor/lib/audit.mjs +++ b/packages/js-asset-auditor/lib/audit.mjs @@ -68,6 +68,8 @@ function parseArgs(argv) { noFilter: false, headed: false, output: "js-assets.toml", + config: null, + force: false, }; for (let i = 2; i < argv.length; i++) { @@ -86,6 +88,16 @@ function parseArgs(argv) { args.headed = true; } else if (arg === "--output") { args.output = argv[++i]; + } else if (arg === "--config") { + // --config with optional path: default to "trusted-server.toml" + const next = argv[i + 1]; + if (next && !next.startsWith("--")) { + args.config = argv[++i]; + } else { + args.config = "trusted-server.toml"; + } + } else if (arg === "--force") { + args.force = true; } else if (!arg.startsWith("--") && !args.url) { args.url = arg.startsWith("http") ? 
arg : `https://${arg}`; } else { @@ -96,7 +108,7 @@ function parseArgs(argv) { if (!args.url) { console.error( - "Usage: audit-js-assets [--diff] [--settle ] [--first-party ] [--no-filter] [--headed] [--output ]", + "Usage: audit-js-assets [--diff] [--settle ] [--first-party ] [--no-filter] [--headed] [--output ] [--config [path]] [--force]", ); process.exit(1); } @@ -208,6 +220,47 @@ export async function main() { : result.summary.new.length; console.error(`Wrote ${args.output} (${count} entries)`); + // Integration detection & config generation + if (args.config) { + const { detectIntegrations, generateConfig } = await import( + "./detect.mjs" + ); + const detection = detectIntegrations(scriptUrls); + + if (detection.integrations.length > 0) { + // Check if config file already exists + let fileExists = false; + try { + readFileSync(args.config); + fileExists = true; + } catch { + // File doesn't exist — safe to write + } + + if (fileExists && !args.force) { + console.error( + `${args.config} already exists. 
Use --force to overwrite.`, + ); + } else { + const configToml = generateConfig(domain, args.url, detection); + writeFileSync(args.config, configToml); + console.error( + `Wrote ${args.config} (${detection.integrations.length} integrations detected)`, + ); + } + } else { + console.error("No integrations detected — skipping config generation"); + } + + result.summary.integrations = detection.integrations.map((i) => ({ + id: i.id, + label: i.label, + category: i.category, + extracted: i.extracted, + todos: i.todos, + })); + } + console.log(JSON.stringify(result.summary)); } finally { if (browser.isConnected()) { diff --git a/packages/js-asset-auditor/lib/detect.mjs b/packages/js-asset-auditor/lib/detect.mjs new file mode 100644 index 00000000..d2be4980 --- /dev/null +++ b/packages/js-asset-auditor/lib/detect.mjs @@ -0,0 +1,278 @@ +// JS Asset Auditor — Integration Detection & Config Generation +// +// Detects known integrations from raw script URLs captured during a page sweep, +// then generates a trusted-server.toml with appropriate [integrations.*] sections. +// +// Integration patterns are derived from the Rust source in +// crates/trusted-server-core/src/integrations/. 
+ +// --------------------------------------------------------------------------- +// Integration pattern registry +// --------------------------------------------------------------------------- + +const PREBID_SUFFIXES = ["/prebid.js", "/prebid.min.js", "/prebidjs.js", "/prebidjs.min.js"]; + +const INTEGRATION_PATTERNS = [ + { + id: "gpt", + label: "Google Publisher Tags", + match: (url) => + url.hostname === "securepubads.g.doubleclick.net" && + url.pathname.startsWith("/tag/js/gpt"), + extract: (url) => ({ + script_url: `${url.origin}${url.pathname}`, + }), + defaults: { + cache_ttl_seconds: 3600, + rewrite_script: true, + }, + todos: [], + category: "full", + }, + { + id: "google_tag_manager", + label: "Google Tag Manager", + match: (url) => + url.hostname === "www.googletagmanager.com" && + url.pathname.includes("/gtm.js"), + extract: (url) => { + const containerId = url.searchParams.get("id"); + return containerId ? { container_id: containerId } : {}; + }, + defaults: {}, + todos: (extracted) => (extracted.container_id ? 
[] : ["container_id"]), + category: "partial", + }, + { + id: "didomi", + label: "Didomi Consent", + match: (url) => + url.hostname === "sdk.privacy-center.org" || + url.hostname === "api.privacy-center.org", + extract: () => ({}), + defaults: { + sdk_origin: "https://sdk.privacy-center.org", + api_origin: "https://api.privacy-center.org", + }, + todos: [], + category: "full", + }, + { + id: "datadome", + label: "DataDome Bot Protection", + match: (url) => + url.hostname === "js.datadome.co" || + url.hostname === "api-js.datadome.co", + extract: () => ({}), + defaults: { + sdk_origin: "https://js.datadome.co", + api_origin: "https://api-js.datadome.co", + cache_ttl_seconds: 3600, + rewrite_sdk: true, + }, + todos: [], + category: "full", + }, + { + id: "lockr", + label: "Lockr Identity", + match: (url) => { + const href = url.href.toLowerCase(); + return ( + (url.hostname.includes("aim.loc.kr") || + url.hostname.includes("identity.loc.kr")) && + href.includes("identity-lockr") && + href.endsWith(".js") + ); + }, + extract: (url) => ({ + sdk_url: url.href, + }), + defaults: { + api_endpoint: "https://identity.loc.kr", + cache_ttl_seconds: 3600, + rewrite_sdk: true, + }, + todos: ["app_id"], + category: "partial", + }, + { + id: "permutive", + label: "Permutive DMP", + match: (url) => + (url.hostname.endsWith(".edge.permutive.app") || + url.hostname === "cdn.permutive.com") && + url.pathname.endsWith("-web.js"), + extract: (url) => { + const result = {}; + // Extract organization_id from subdomain: {org}.edge.permutive.app + if (url.hostname.endsWith(".edge.permutive.app")) { + result.organization_id = url.hostname.replace(".edge.permutive.app", ""); + } + // Extract workspace_id from filename: /{workspace}-web.js + const filename = url.pathname.split("/").pop() || ""; + const wsMatch = filename.match(/^(.+)-web\.js$/); + if (wsMatch) { + result.workspace_id = wsMatch[1]; + } + return result; + }, + defaults: { + api_endpoint: "https://api.permutive.com", + 
secure_signals_endpoint: "https://secure-signals.permutive.app", + }, + todos: (extracted) => { + const missing = []; + if (!extracted.organization_id) missing.push("organization_id"); + if (!extracted.workspace_id) missing.push("workspace_id"); + return missing; + }, + category: "partial", + }, + { + id: "prebid", + label: "Prebid Header Bidding", + match: (url) => PREBID_SUFFIXES.some((s) => url.pathname.endsWith(s)), + extract: () => ({}), + defaults: { + timeout_ms: 1000, + debug: false, + }, + todos: ["server_url", "bidders"], + category: "detect_only", + }, + { + id: "aps", + label: "Amazon Publisher Services", + match: (url) => + url.hostname === "c.amazon-adsystem.com" && + url.pathname.includes("/apstag"), + extract: () => ({}), + defaults: { + endpoint: "https://aax.amazon-adsystem.com/e/dtb/bid", + timeout_ms: 1000, + }, + todos: ["pub_id"], + category: "detect_only", + }, +]; + +// --------------------------------------------------------------------------- +// Detection +// --------------------------------------------------------------------------- + +export function detectIntegrations(rawUrls) { + const detected = new Map(); + + for (const rawUrl of rawUrls) { + let url; + try { + url = new URL(rawUrl); + } catch { + continue; + } + + for (const pattern of INTEGRATION_PATTERNS) { + if (!pattern.match(url)) continue; + + if (detected.has(pattern.id)) { + // Merge: accumulate source URLs, fill in missing extracted fields + const existing = detected.get(pattern.id); + existing.sourceUrls.push(rawUrl); + const newExtracted = pattern.extract(url); + for (const [key, value] of Object.entries(newExtracted)) { + if (!(key in existing.extracted)) existing.extracted[key] = value; + } + } else { + const extracted = pattern.extract(url); + const todos = + typeof pattern.todos === "function" + ? 
pattern.todos(extracted) + : [...pattern.todos]; + + detected.set(pattern.id, { + id: pattern.id, + label: pattern.label, + category: pattern.category, + extracted, + defaults: { ...pattern.defaults }, + todos, + sourceUrls: [rawUrl], + }); + } + + break; // Each URL matches at most one integration + } + } + + // Recalculate dynamic todos after merging + for (const [id, entry] of detected) { + const pattern = INTEGRATION_PATTERNS.find((p) => p.id === id); + if (pattern && typeof pattern.todos === "function") { + entry.todos = pattern.todos(entry.extracted); + } + } + + return { + integrations: [...detected.values()], + }; +} + +// --------------------------------------------------------------------------- +// Config generation +// --------------------------------------------------------------------------- + +function formatTomlValue(value) { + if (typeof value === "string") return `"${value}"`; + if (typeof value === "boolean") return value ? "true" : "false"; + if (typeof value === "number") return String(value); + if (Array.isArray(value)) + return `[${value.map((v) => `"${v}"`).join(", ")}]`; + return String(value); +} + +export function generateConfig(domain, targetUrl, detectionResult) { + const today = new Date().toISOString().slice(0, 10); + let toml = ""; + + // Header + toml += `# Generated by js-asset-auditor on ${today}\n`; + toml += `# Source URL: ${targetUrl}\n`; + toml += `#\n`; + toml += `# Review all values before deploying. 
Fields marked TODO need manual input.\n`; + toml += `# Commented-out fields show defaults — uncomment to override.\n`; + toml += `\n`; + + // Publisher section + toml += `[publisher]\n`; + toml += `domain = "${domain}"\n`; + toml += `# cookie_domain = ".${domain}"\n`; + toml += `# origin_url = "https://origin.${domain}"\n`; + toml += `# proxy_secret = "change-me"\n`; + + // Integration sections + for (const integration of detectionResult.integrations) { + toml += `\n`; + toml += `[integrations.${integration.id}]\n`; + toml += `enabled = true\n`; + + // Auto-extracted fields + for (const [key, value] of Object.entries(integration.extracted)) { + toml += `${key} = ${formatTomlValue(value)} # auto-detected\n`; + } + + // TODO fields + for (const field of integration.todos) { + toml += `${field} = "" # TODO: set your ${integration.label} ${field}\n`; + } + + // Default fields (commented out) + for (const [key, value] of Object.entries(integration.defaults)) { + // Skip if already in extracted + if (key in integration.extracted) continue; + toml += `# ${key} = ${formatTomlValue(value)}\n`; + } + } + + return toml; +} diff --git a/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md b/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md index fe9f190f..2165b1e6 100644 --- a/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md +++ b/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md @@ -7,7 +7,7 @@ Audit a publisher page for third-party JS assets and generate `js-assets.toml` e Usage: /js-asset-auditor:audit-js-assets $ARGUMENTS -`$ARGUMENTS`: ` [--diff] [--settle ] [--first-party ,...] [--no-filter] [--headed]` +`$ARGUMENTS`: ` [--diff] [--settle ] [--first-party ,...] 
[--no-filter] [--headed] [--config [path]] [--force]` --- @@ -62,3 +62,18 @@ Missing: {missing.length} asset(s) no longer seen on page ⚠ NEW {prefix} {shortUrl} → review in js-assets.toml MISSING {slug} {originUrl} → may have been removed or renamed ``` + +### Integration detection (when --config is used) + +If the JSON summary includes an `integrations` array, append: + +``` +Detected Integrations: + {id} ✓ fully configured + {id} ✓ {field}={value} (auto-detected) + {id} ⚠ {field} needs manual input + +→ {config path} generated with {count} integrations +``` + +Use ✓ for `full` category and integrations with no TODOs. Use ⚠ for integrations with TODO fields. From e0c7e0c380f812ca5b206b9445aeab2ed5b92897 Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 13 Apr 2026 14:56:33 -0500 Subject: [PATCH 12/12] Default to headed browser to avoid bot detection Switch from headless-by-default to headed-by-default. Sites with bot protection (DataDome, Cloudflare, etc.) block headless browsers. The --headed flag becomes --headless for CI/automation use cases. 
--- .../specs/2026-04-01-js-asset-auditor-design.md | 6 +++--- packages/js-asset-auditor/lib/audit.mjs | 10 +++++----- .../js-asset-auditor/skills/audit-js-assets/SKILL.md | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md index 348a00bc..207410b4 100644 --- a/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md +++ b/docs/superpowers/specs/2026-04-01-js-asset-auditor-design.md @@ -26,7 +26,7 @@ It also runs as a monitoring tool — `--diff` mode compares a new sweep against /js-asset-auditor:audit-js-assets https://www.publisher.com --diff # diff — compare against existing file /js-asset-auditor:audit-js-assets https://www.publisher.com --settle 15000 # longer settle for ad-tech-heavy pages /js-asset-auditor:audit-js-assets https://www.publisher.com --no-filter # bypass heuristic filtering -/js-asset-auditor:audit-js-assets https://www.publisher.com --headed # visible browser for debugging +/js-asset-auditor:audit-js-assets https://www.publisher.com --headless # headless mode for CI/automation /js-asset-auditor:audit-js-assets https://www.publisher.com --config # also generate trusted-server.toml # Direct CLI invocation (no Claude Code required) @@ -43,7 +43,7 @@ node packages/js-asset-auditor/lib/audit.mjs https://www.publisher.com --config The CLI (`packages/js-asset-auditor/lib/audit.mjs`) performs the full sweep: 1. Resolve publisher domain: `--domain` flag → `trusted-server.toml` → infer from target URL -2. Launch headless Chromium via Playwright (visible with `--headed`) +2. Launch Chromium via Playwright (headed by default to avoid bot detection; `--headless` for CI) 3. Register a response listener for `resourceType() === 'script'` to capture all script network requests 4. Navigate to target URL (`page.goto`, 30s timeout, follows redirects transparently) 5. 
Wait for page load settle: `page.waitForTimeout(SETTLE_MS)` where `SETTLE_MS` defaults to 6000 (configurable via `--settle `) @@ -309,7 +309,7 @@ packages/js-asset-auditor/ └── settings.json # Auto-grants Bash(audit-js-assets:*) permission ``` -1. **Playwright CLI** (`lib/audit.mjs`) — Launches headless Chromium, navigates to the target URL, collects script network requests and head script DOM state, then calls `processAssets()`. Outputs TOML file + JSON summary. Can be run directly without Claude Code. +1. **Playwright CLI** (`lib/audit.mjs`) — Launches Chromium (headed by default), navigates to the target URL, collects script network requests and head script DOM state, then calls `processAssets()`. Outputs TOML file + JSON summary. Can be run directly without Claude Code. 2. **Processing library** (`lib/process.mjs`) — Pure Node.js module (no external dependencies) that exports `processAssets()` and individual utility functions. Handles URL normalization, first-party filtering, heuristic filtering, wildcard detection, slug generation, and TOML formatting. 3. **Claude Code skill** (`skills/audit-js-assets/SKILL.md`) — Thin wrapper that invokes the CLI via the `bin/audit-js-assets` executable and formats the JSON summary. 
diff --git a/packages/js-asset-auditor/lib/audit.mjs b/packages/js-asset-auditor/lib/audit.mjs index f5f5a29b..7334ece8 100644 --- a/packages/js-asset-auditor/lib/audit.mjs +++ b/packages/js-asset-auditor/lib/audit.mjs @@ -66,7 +66,7 @@ function parseArgs(argv) { settle: 6000, firstParty: [], noFilter: false, - headed: false, + headless: false, output: "js-assets.toml", config: null, force: false, @@ -84,8 +84,8 @@ function parseArgs(argv) { args.firstParty = argv[++i].split(",").filter(Boolean); } else if (arg === "--no-filter") { args.noFilter = true; - } else if (arg === "--headed") { - args.headed = true; + } else if (arg === "--headless") { + args.headless = true; } else if (arg === "--output") { args.output = argv[++i]; } else if (arg === "--config") { @@ -108,7 +108,7 @@ function parseArgs(argv) { if (!args.url) { console.error( - "Usage: audit-js-assets [--diff] [--settle ] [--first-party ] [--no-filter] [--headed] [--output ] [--config [path]] [--force]", + "Usage: audit-js-assets [--diff] [--settle ] [--first-party ] [--no-filter] [--headless] [--output ] [--config [path]] [--force]", ); process.exit(1); } @@ -154,7 +154,7 @@ export async function main() { console.error(`Launching browser...`); let browser; try { - browser = await chromium.launch({ headless: !args.headed }); + browser = await chromium.launch({ headless: args.headless }); } catch (err) { if (err.message.includes("Executable doesn't exist")) { console.error( diff --git a/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md b/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md index 2165b1e6..8590f852 100644 --- a/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md +++ b/packages/js-asset-auditor/skills/audit-js-assets/SKILL.md @@ -7,7 +7,7 @@ Audit a publisher page for third-party JS assets and generate `js-assets.toml` e Usage: /js-asset-auditor:audit-js-assets $ARGUMENTS -`$ARGUMENTS`: ` [--diff] [--settle ] [--first-party ,...] 
[--no-filter] [--headed] [--config [path]] [--force]` +`$ARGUMENTS`: ` [--diff] [--settle ] [--first-party ,...] [--no-filter] [--headless] [--config [path]] [--force]` ---