firecrawl · abimaelmartell · May 28, 2026
diff --git a/crates/html-extractor-napi/__test__/binding.test.mjs b/crates/html-extractor-napi/__test__/binding.test.mjs
@@ -65,6 +65,30 @@ test('conflicting options reject the promise', async () => {
   )
 })
 
+test('outputDecisions returns a keep/drop ledger', async () => {
+  const withChrome = `
+    <html><body>
+      <main><article>
+        <h1>Hello World</h1>
+        <p>This is the first paragraph of an article, long enough to clear the extraction threshold with real prose preserved in the output.</p>
+        <p>A second paragraph gives the scored walk a solid main-content region to lock onto for this page.</p>
+        <aside class="related-stories"><a href="/a">Other story one</a><a href="/b">Other story two</a></aside>
+      </article></main>
+      <footer class="site-footer">© 2024 ExampleSite</footer>
+    </body></html>`
+  const r = await extract(withChrome, { outputDecisions: true })
+  assert.ok(Array.isArray(r.decisions), 'decisions should be an array')
+  assert.ok(r.decisions.length >= 1)
+  assert.equal(r.decisions[0].kept, true, 'first entry is the kept root')
+  for (const d of r.decisions) {
+    assert.equal(typeof d.selector, 'string')
+    assert.ok([d.score, d.confidence].every((v) => v >= 0 && v <= 1), 'score/confidence in [0,1]')
+  }
+  // default: no ledger
+  const plain = await extract(withChrome)
+  assert.ok(!plain.decisions, 'decisions omitted unless opted in')
+})
+
 test('version() returns a semver-shaped string', () => {
   const v = version()
   assert.match(v, /^\d+\.\d+\.\d+/)

diff --git a/crates/html-extractor-napi/index.d.ts b/crates/html-extractor-napi/index.d.ts
@@ -10,7 +10,10 @@ export interface ExtractOptions {
   favorRecall?: boolean
   /** Include a plain-text mirror of the markdown in the result. */
   outputText?: boolean
-  /** Reserved for Phase 4 (per-element decisions ledger). */
+  /**
+   * Populate `result.decisions`: the kept main container plus every
+   * boilerplate block dropped during post-clean. Off by default.
+   */
   outputDecisions?: boolean
   /** Hint for language detection. */
   targetLanguage?: string
@@ -62,6 +65,17 @@ export interface ExtractStats {
   pageType: string
 }
 
+export interface Decision {
+  /** CSS-selector-shaped signature: tag + sorted classes + `#id`. */
+  selector: string
+  /** Fraction of the kept subtree's text this element held, `[0,1]`; `1` for the kept root. */
+  score: number
+  /** Whether the element survived into the output. */
+  kept: boolean
+  /** Confidence in the keep/drop call, `[0,1]`; the extraction `quality` for the kept root. */
+  confidence: number
+}
+
 export interface ExtractResult {
   /** Cleaned main content as GitHub-flavored markdown. */
   markdown: string
@@ -77,6 +91,8 @@ export interface ExtractResult {
   metadata?: Metadata
   /** Internal stats useful for telemetry. */
   stats?: ExtractStats
+  /** Keep/drop ledger when `outputDecisions: true`: kept root then dropped blocks. */
+  decisions?: Decision[]
   /** Reason for a low-confidence / failed extraction, if any. */
   errorReason?: string
 }

diff --git a/crates/html-extractor-napi/src/lib.rs b/crates/html-extractor-napi/src/lib.rs
@@ -52,6 +52,14 @@ pub struct ExtractStats {
     pub page_type: String,
 }
 
+#[napi(object)]
+pub struct Decision {
+    pub selector: String,
+    pub score: f64,
+    pub kept: bool,
+    pub confidence: f64,
+}
+
 #[napi(object)]
 pub struct ExtractResult {
     pub markdown: String,
@@ -61,6 +69,7 @@ pub struct ExtractResult {
     pub language: Option<String>,
     pub metadata: Option<Metadata>,
     pub stats: Option<ExtractStats>,
+    pub decisions: Option<Vec<Decision>>,
     pub error_reason: Option<String>,
 }
 
@@ -141,6 +150,16 @@ fn map_result(r: html_extractor::ExtractResult, want_text: bool) -> ExtractResul
         used_fallback: s.used_fallback,
         page_type: s.page_type.to_string(),
     });
+    let decisions = r.decisions.map(|ds| {
+        ds.into_iter()
+            .map(|d| Decision {
+                selector: d.selector,
+                score: d.score as f64,
+                kept: d.kept,
+                confidence: d.confidence as f64,
+            })
+            .collect()
+    });
     ExtractResult {
         markdown: r.markdown,
         text,
@@ -149,6 +168,7 @@ fn map_result(r: html_extractor::ExtractResult, want_text: bool) -> ExtractResul
         language: r.language,
         metadata,
         stats,
+        decisions,
         error_reason: r.error_reason.map(|e| e.to_string()),
     }
 }
@@ -229,6 +249,7 @@ fn too_large_result(html_len: usize, limit: usize) -> ExtractResult {
         language: None,
         metadata: None,
         stats: None,
+        decisions: None,
         error_reason: Some(format!(
             "input_too_large: {html_len} bytes exceeds max_input_size {limit}"
         )),

diff --git a/crates/html-extractor/src/clean.rs b/crates/html-extractor/src/clean.rs
@@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
 use regex::Regex;
 
 use crate::tree::Tree;
-use crate::types::ExtractOptions;
+use crate::types::{Decision, ExtractOptions};
 
 /// Tags whose entire subtree is never content.
 const KILL_TAGS: &[&str] = &[
@@ -131,6 +131,8 @@ fn post_clean_inner(
     // descendant indices that the renderer will respect. This keeps `tree`
     // shareable with other passes (e.g. for fallback retries).
     let mut skip: std::collections::HashSet<usize> = std::collections::HashSet::new();
+    // Stays empty (an empty `Vec` holds no heap allocation) unless the ledger was requested.
+    let mut decisions: Vec<Decision> = Vec::new();
     // Subtree text/link-char metrics for the whole selection, computed in one
     // post-order pass. Looking these up is O(1); the previous code called
     // `full_text` per node, which re-walked each subtree (O(N²)).
@@ -182,20 +184,50 @@ fn post_clean_inner(
         {
             skip.insert(idx);
             stripped_total += idx_text;
+            if options.output_decisions {
+                let link_density = if idx_text > 0 {
+                    (metrics.link_chars[idx] as f32 / idx_text as f32).clamp(0.0, 1.0)
+                } else {
+                    0.0
+                };
+                decisions.push(Decision {
+                    selector: elem.selector(),
+                    // Share of the kept subtree's text this dropped block held.
+                    score: if root_text_len > 0 {
+                        idx_text as f32 / root_text_len as f32
+                    } else {
+                        0.0
+                    },
+                    kept: false,
+                    // An explicit chrome/share class match is a strong signal;
+                    // a link-density-only drop is only as sure as the ratio.
+                    confidence: if chrome_hit || share_hit {
+                        0.9
+                    } else {
+                        link_density
+                    },
+                });
+            }
             // don't descend into a dropped subtree
             continue;
         }
         for &c in &elem.children {
             stack.push(c);
         }
     }
-    CleanedRoot { root, skip }
+    CleanedRoot {
+        root,
+        skip,
+        decisions,
+    }
 }
 
-/// Result of post-clean: a root index plus a set of subtrees to skip.
+/// Result of post-clean: a root index, the set of subtrees to skip, and (when
+/// `output_decisions` is set) the per-drop decisions recorded along the way.
 pub(crate) struct CleanedRoot {
     pub root: usize,
     pub skip: std::collections::HashSet<usize>,
+    pub decisions: Vec<Decision>,
 }
 
 static CHROME_RE: Lazy<Regex> = Lazy::new(|| {

diff --git a/crates/html-extractor/src/lib.rs b/crates/html-extractor/src/lib.rs
@@ -16,7 +16,9 @@ mod scoring;
 mod tree;
 mod types;
 
-pub use types::{ExtractError, ExtractOptions, ExtractResult, ExtractStats, Metadata, PageType};
+pub use types::{
+    Decision, ExtractError, ExtractOptions, ExtractResult, ExtractStats, Metadata, PageType,
+};
 
 /// Extract the main content of an HTML document, returning a structured
 /// [`ExtractResult`] containing markdown, page-type, metadata, and a confidence
@@ -150,15 +152,30 @@ pub fn extract(html: &str, options: &ExtractOptions) -> Result<ExtractResult, Ex
     };
 
     // Stage 5: post-clean within the kept subtree, then render.
-    let (markdown, text_chars) = if let Some(root) = final_root {
+    let (markdown, text_chars, decisions) = if let Some(root) = final_root {
         let cleaned = if matches!(page_type, PageType::Listing | PageType::Collection) {
             clean::post_clean_lenient_links(&tree, root, options)
         } else {
             clean::post_clean(&tree, root, options)
         };
-        render::render(&tree, &cleaned, options)
+        let (markdown, text_chars) = render::render(&tree, &cleaned, options);
+        // Ledger: the kept main container followed by each dropped block.
+        let decisions = if options.output_decisions {
+            let mut v = Vec::with_capacity(cleaned.decisions.len() + 1);
+            v.push(Decision {
+                selector: tree.get(root).selector(),
+                score: 1.0,
+                kept: true,
+                confidence: quality,
+            });
+            v.extend(cleaned.decisions);
+            Some(v)
+        } else {
+            None
+        };
+        (markdown, text_chars, decisions)
     } else {
-        (String::new(), 0)
+        (String::new(), 0, None)
     };
 
     let stats = ExtractStats {
@@ -180,7 +197,7 @@ pub fn extract(html: &str, options: &ExtractOptions) -> Result<ExtractResult, Ex
         } else {
             None
         },
-        decisions: None,
+        decisions,
         stats: Some(stats),
         error_reason: None,
     })

diff --git a/crates/html-extractor/src/tree.rs b/crates/html-extractor/src/tree.rs
@@ -49,6 +49,28 @@ impl Element {
         out.to_lowercase()
     }
 
+    /// CSS-selector-shaped signature for this element: `tag` + sorted,
+    /// de-duplicated `.class`es + `#id` (e.g. `div.related.sidebar#aside`).
+    /// Used by the decisions ledger; deterministic so it aggregates across pages.
+    pub fn selector(&self) -> String {
+        let mut s = self.tag.to_string();
+        if let Some(class) = self.attr("class") {
+            let mut classes: Vec<&str> = class.split_whitespace().collect();
+            classes.sort_unstable();
+            // `class="a b a"` must yield the same signature as `class="a b"`.
+            classes.dedup();
+            for c in classes {
+                s.push('.');
+                s.push_str(c);
+            }
+        }
+        if let Some(id) = self.attr("id").map(str::trim).filter(|id| !id.is_empty()) {
+            s.push('#');
+            s.push_str(id);
+        }
+        s
+    }
+
     pub fn attr(&self, name: &str) -> Option<&str> {
         self.attrs
             .iter()

diff --git a/crates/html-extractor/src/types.rs b/crates/html-extractor/src/types.rs
@@ -63,8 +63,9 @@ pub struct ExtractOptions {
     pub favor_recall: bool,
     /// Include the plain-text version in the result.
     pub output_text: bool,
-    /// Include the per-element decisions ledger (currently always `None`; see
-    /// `DECISIONS.md` D10).
+    /// Populate the per-element [`ExtractResult::decisions`] ledger: the
+    /// kept main container plus every boilerplate block post-clean dropped
+    /// inside it. Off by default — building it costs an allocation per drop.
     pub output_decisions: bool,
     /// Hint for language detection.
     pub target_language: Option<String>,
@@ -116,7 +117,9 @@ pub struct ExtractResult {
     pub language: Option<String>,
     /// Metadata pulled from JSON-LD, OpenGraph, etc.
     pub metadata: Option<Metadata>,
-    /// Per-element ledger (Phase 4; presently always `None`).
+    /// Per-element keep/drop ledger: the kept main container followed by each
+    /// boilerplate block post-clean dropped. `None` unless
+    /// [`ExtractOptions::output_decisions`] was set and a main container was found.
     pub decisions: Option<Vec<Decision>>,
     /// Stats describing what happened internally.
     pub stats: Option<ExtractStats>,
@@ -166,16 +169,22 @@ pub struct Metadata {
     pub schema_type: Option<String>,
 }
 
-/// Per-element decision (Phase 4 stub).
+/// A single keep/drop decision recorded during extraction, for telemetry and
+/// for the offline rule-learner to mine boilerplate-container signatures.
 #[derive(Debug, Clone)]
 pub struct Decision {
-    /// Selector identifying the element.
+    /// CSS-selector-shaped signature: `tag` + sorted `.class`es + `#id`
+    /// (e.g. `div.related.sidebar#aside`). Stable enough to aggregate across
+    /// pages of the same template.
     pub selector: String,
-    /// Raw score.
+    /// Fraction of the kept subtree's text contained in this element, `[0, 1]`.
+    /// Near-zero for a small dropped widget; always `1.0` for the kept root.
     pub score: f32,
-    /// `true` if the element was kept in the output.
+    /// Whether the element survived into the output. `true` for the main
+    /// container, `false` for each dropped boilerplate block.
     pub kept: bool,
-    /// Confidence in `[0.0, 1.0]`.
+    /// Confidence in the keep/drop call, `[0, 1]`: the extraction quality for
+    /// the kept root, `0.9` for chrome/share class drops, else the link density.
     pub confidence: f32,
 }
 

diff --git a/crates/html-extractor/tests/integration_basic.rs b/crates/html-extractor/tests/integration_basic.rs
@@ -86,6 +86,40 @@ fn page_type_override_is_respected() {
     assert_eq!(r.page_type, PageType::Documentation);
 }
 
+#[test]
+fn decisions_ledger_off_by_default() {
+    let out = extract(SIMPLE_ARTICLE, &ExtractOptions::default()).unwrap();
+    assert!(out.decisions.is_none(), "ledger must be opt-in");
+}
+
+#[test]
+fn decisions_ledger_records_kept_root_and_dropped_chrome() {
+    let opts = ExtractOptions {
+        output_decisions: true,
+        ..Default::default()
+    };
+    let r = extract(SIMPLE_ARTICLE, &opts).unwrap();
+    let decisions = r.decisions.expect("ledger present when opted in");
+
+    // First entry is the kept main container, which by definition holds all of its own text.
+    let root = decisions.first().expect("ledger starts with the kept root");
+    assert!(root.kept, "first decision should be the kept root");
+    assert!(!root.selector.is_empty() && root.score == 1.0, "bad root entry: {root:?}");
+
+    // The related-stories sidebar and footer are dropped — at least one
+    // kept=false entry, each carrying a non-empty selector.
+    let drops: Vec<_> = decisions.iter().filter(|d| !d.kept).collect();
+    assert!(
+        !drops.is_empty(),
+        "expected at least one dropped boilerplate block, got {decisions:?}"
+    );
+    for d in &drops {
+        assert!(!d.selector.is_empty(), "drop selector should be non-empty");
+        assert!((0.0..=1.0).contains(&d.confidence));
+        assert!((0.0..=1.0).contains(&d.score));
+    }
+}
+
 #[test]
 fn no_panic_on_malformed_input() {
     // unclosed tags, weird structure