diff --git a/crates/html-extractor-napi/__test__/binding.test.mjs b/crates/html-extractor-napi/__test__/binding.test.mjs
index d76a011..9ab7b78 100644
--- a/crates/html-extractor-napi/__test__/binding.test.mjs
+++ b/crates/html-extractor-napi/__test__/binding.test.mjs
@@ -65,6 +65,30 @@ test('conflicting options reject the promise', async () => {
)
})
+test('outputDecisions returns a keep/drop ledger', async () => {
+ const withChrome = `
+
+
+ Hello World
+ This is the first paragraph of an article, long enough to clear the extraction threshold with real prose preserved in the output.
+ A second paragraph gives the scored walk a solid main-content region to lock onto for this page.
+
+
+
+ `
+ const r = await extract(withChrome, { outputDecisions: true })
+ assert.ok(Array.isArray(r.decisions), 'decisions should be an array')
+ assert.ok(r.decisions.length >= 1)
+ assert.equal(r.decisions[0].kept, true, 'first entry is the kept root')
+ for (const d of r.decisions) {
+ assert.equal(typeof d.selector, 'string')
+ assert.ok(d.confidence >= 0 && d.confidence <= 1)
+ }
+ // default: no ledger
+ const plain = await extract(withChrome)
+ assert.ok(!plain.decisions, 'decisions omitted unless opted in')
+})
+
test('version() returns a semver-shaped string', () => {
const v = version()
assert.match(v, /^\d+\.\d+\.\d+/)
diff --git a/crates/html-extractor-napi/index.d.ts b/crates/html-extractor-napi/index.d.ts
index 9e04c75..03272ec 100644
--- a/crates/html-extractor-napi/index.d.ts
+++ b/crates/html-extractor-napi/index.d.ts
@@ -10,7 +10,10 @@ export interface ExtractOptions {
favorRecall?: boolean
/** Include a plain-text mirror of the markdown in the result. */
outputText?: boolean
- /** Reserved for Phase 4 (per-element decisions ledger). */
+ /**
+ * Populate `result.decisions`: the kept main container plus every
+ * boilerplate block dropped during post-clean. Off by default.
+ */
outputDecisions?: boolean
/** Hint for language detection. */
targetLanguage?: string
@@ -62,6 +65,17 @@ export interface ExtractStats {
pageType: string
}
+export interface Decision {
+ /** CSS-selector-shaped signature: tag + sorted classes + `#id`. */
+ selector: string
+ /** Fraction of the kept subtree's text this element held, `[0,1]`. */
+ score: number
+ /** Whether the element survived into the output. */
+ kept: boolean
+ /** Confidence in the keep/drop call, `[0,1]`. */
+ confidence: number
+}
+
export interface ExtractResult {
/** Cleaned main content as GitHub-flavored markdown. */
markdown: string
@@ -77,6 +91,8 @@ export interface ExtractResult {
metadata?: Metadata
/** Internal stats useful for telemetry. */
stats?: ExtractStats
+ /** Keep/drop ledger when `outputDecisions: true`: kept root then dropped blocks. */
+ decisions?: Decision[]
/** Reason for a low-confidence / failed extraction, if any. */
errorReason?: string
}
diff --git a/crates/html-extractor-napi/src/lib.rs b/crates/html-extractor-napi/src/lib.rs
index 9339ce4..0937ba6 100644
--- a/crates/html-extractor-napi/src/lib.rs
+++ b/crates/html-extractor-napi/src/lib.rs
@@ -52,6 +52,14 @@ pub struct ExtractStats {
pub page_type: String,
}
+#[napi(object)]
+pub struct Decision {
+ pub selector: String,
+ pub score: f64,
+ pub kept: bool,
+ pub confidence: f64,
+}
+
#[napi(object)]
pub struct ExtractResult {
pub markdown: String,
@@ -61,6 +69,7 @@ pub struct ExtractResult {
pub language: Option,
pub metadata: Option,
pub stats: Option,
+ pub decisions: Option>,
pub error_reason: Option,
}
@@ -141,6 +150,16 @@ fn map_result(r: html_extractor::ExtractResult, want_text: bool) -> ExtractResul
used_fallback: s.used_fallback,
page_type: s.page_type.to_string(),
});
+ let decisions = r.decisions.map(|ds| {
+ ds.into_iter()
+ .map(|d| Decision {
+ selector: d.selector,
+ score: d.score as f64,
+ kept: d.kept,
+ confidence: d.confidence as f64,
+ })
+ .collect()
+ });
ExtractResult {
markdown: r.markdown,
text,
@@ -149,6 +168,7 @@ fn map_result(r: html_extractor::ExtractResult, want_text: bool) -> ExtractResul
language: r.language,
metadata,
stats,
+ decisions,
error_reason: r.error_reason.map(|e| e.to_string()),
}
}
@@ -229,6 +249,7 @@ fn too_large_result(html_len: usize, limit: usize) -> ExtractResult {
language: None,
metadata: None,
stats: None,
+ decisions: None,
error_reason: Some(format!(
"input_too_large: {html_len} bytes exceeds max_input_size {limit}"
)),
diff --git a/crates/html-extractor/src/clean.rs b/crates/html-extractor/src/clean.rs
index e2d5d60..87489d5 100644
--- a/crates/html-extractor/src/clean.rs
+++ b/crates/html-extractor/src/clean.rs
@@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
use regex::Regex;
use crate::tree::Tree;
-use crate::types::ExtractOptions;
+use crate::types::{Decision, ExtractOptions};
/// Tags whose entire subtree is never content.
const KILL_TAGS: &[&str] = &[
@@ -131,6 +131,8 @@ fn post_clean_inner(
// descendant indices that the renderer will respect. This keeps `tree`
// shareable with other passes (e.g. for fallback retries).
let mut skip: std::collections::HashSet = std::collections::HashSet::new();
+ // Always created, but stays empty (no heap allocation) unless the caller asked for the decisions ledger.
+ let mut decisions: Vec = Vec::new();
// Subtree text/link-char metrics for the whole selection, computed in one
// post-order pass. Looking these up is O(1); the previous code called
// `full_text` per node, which re-walked each subtree (O(N²)).
@@ -182,6 +184,30 @@ fn post_clean_inner(
{
skip.insert(idx);
stripped_total += idx_text;
+ if options.output_decisions {
+ let link_density = if idx_text > 0 {
+ (metrics.link_chars[idx] as f32 / idx_text as f32).clamp(0.0, 1.0)
+ } else {
+ 0.0
+ };
+ decisions.push(Decision {
+ selector: elem.selector(),
+ // Share of the kept subtree's text this dropped block held.
+ score: if root_text_len > 0 {
+ idx_text as f32 / root_text_len as f32
+ } else {
+ 0.0
+ },
+ kept: false,
+ // An explicit chrome/share class match is a strong signal;
+ // a link-density-only drop is only as sure as the ratio.
+ confidence: if chrome_hit || share_hit {
+ 0.9
+ } else {
+ link_density
+ },
+ });
+ }
// don't descend into a dropped subtree
continue;
}
@@ -189,13 +215,19 @@ fn post_clean_inner(
stack.push(c);
}
}
- CleanedRoot { root, skip }
+ CleanedRoot {
+ root,
+ skip,
+ decisions,
+ }
}
-/// Result of post-clean: a root index plus a set of subtrees to skip.
+/// Result of post-clean: a root index, the set of subtrees to skip, and (when
+/// `output_decisions` is set) the per-drop decisions recorded along the way.
pub(crate) struct CleanedRoot {
pub root: usize,
pub skip: std::collections::HashSet,
+ pub decisions: Vec,
}
static CHROME_RE: Lazy = Lazy::new(|| {
diff --git a/crates/html-extractor/src/lib.rs b/crates/html-extractor/src/lib.rs
index 9d85417..f1769a1 100644
--- a/crates/html-extractor/src/lib.rs
+++ b/crates/html-extractor/src/lib.rs
@@ -16,7 +16,9 @@ mod scoring;
mod tree;
mod types;
-pub use types::{ExtractError, ExtractOptions, ExtractResult, ExtractStats, Metadata, PageType};
+pub use types::{
+ Decision, ExtractError, ExtractOptions, ExtractResult, ExtractStats, Metadata, PageType,
+};
/// Extract the main content of an HTML document, returning a structured
/// [`ExtractResult`] containing markdown, page-type, metadata, and a confidence
@@ -150,15 +152,30 @@ pub fn extract(html: &str, options: &ExtractOptions) -> Result Result String {
+ let mut s = self.tag.to_string();
+ if let Some(class) = self.attr("class") {
+ let mut classes: Vec<&str> = class.split_whitespace().collect();
+ classes.sort_unstable();
+ for c in classes {
+ s.push('.');
+ s.push_str(c);
+ }
+ }
+ if let Some(id) = self.attr("id") {
+ if !id.trim().is_empty() {
+ s.push('#');
+ s.push_str(id.trim());
+ }
+ }
+ s
+ }
+
pub fn attr(&self, name: &str) -> Option<&str> {
self.attrs
.iter()
diff --git a/crates/html-extractor/src/types.rs b/crates/html-extractor/src/types.rs
index 7795bf8..bfc10da 100644
--- a/crates/html-extractor/src/types.rs
+++ b/crates/html-extractor/src/types.rs
@@ -63,8 +63,9 @@ pub struct ExtractOptions {
pub favor_recall: bool,
/// Include the plain-text version in the result.
pub output_text: bool,
- /// Include the per-element decisions ledger (currently always `None`; see
- /// `DECISIONS.md` D10).
+ /// Populate the per-element [`ExtractResult::decisions`] ledger: the
+ /// kept main container plus every boilerplate block post-clean dropped
+ /// inside it. Off by default — building it costs an allocation per drop.
pub output_decisions: bool,
/// Hint for language detection.
pub target_language: Option,
@@ -116,7 +117,9 @@ pub struct ExtractResult {
pub language: Option,
/// Metadata pulled from JSON-LD, OpenGraph, etc.
pub metadata: Option,
- /// Per-element ledger (Phase 4; presently always `None`).
+ /// Per-element keep/drop ledger. `None` unless
+ /// [`ExtractOptions::output_decisions`] was set; when set, the kept main
+ /// container followed by each boilerplate block post-clean dropped.
pub decisions: Option>,
/// Stats describing what happened internally.
pub stats: Option,
@@ -166,16 +169,22 @@ pub struct Metadata {
pub schema_type: Option,
}
-/// Per-element decision (Phase 4 stub).
+/// A single keep/drop decision recorded during extraction, for telemetry and
+/// for the offline rule-learner to mine boilerplate-container signatures.
#[derive(Debug, Clone)]
pub struct Decision {
- /// Selector identifying the element.
+ /// CSS-selector-shaped signature: `tag` + sorted `.class`es + `#id`
+ /// (e.g. `div.related.sidebar#aside`). Stable enough to aggregate across
+ /// pages of the same template.
pub selector: String,
- /// Raw score.
+ /// Fraction of the kept subtree's text contained in this element, `[0, 1]`.
+ /// Near-zero for a small dropped widget; ~1.0 for the kept root.
pub score: f32,
- /// `true` if the element was kept in the output.
+ /// Whether the element survived into the output. `true` for the main
+ /// container, `false` for each dropped boilerplate block.
pub kept: bool,
- /// Confidence in `[0.0, 1.0]`.
+ /// Confidence in the keep/drop call, `[0, 1]`. High for explicit chrome/share
+ /// class matches; equal to the link density for link-density-only drops.
pub confidence: f32,
}
diff --git a/crates/html-extractor/tests/integration_basic.rs b/crates/html-extractor/tests/integration_basic.rs
index b47840b..1269ce6 100644
--- a/crates/html-extractor/tests/integration_basic.rs
+++ b/crates/html-extractor/tests/integration_basic.rs
@@ -86,6 +86,40 @@ fn page_type_override_is_respected() {
assert_eq!(r.page_type, PageType::Documentation);
}
+#[test]
+fn decisions_ledger_off_by_default() {
+ let r = extract(SIMPLE_ARTICLE, &ExtractOptions::default()).unwrap();
+ assert!(r.decisions.is_none(), "ledger must be opt-in");
+}
+
+#[test]
+fn decisions_ledger_records_kept_root_and_dropped_chrome() {
+ let opts = ExtractOptions {
+ output_decisions: true,
+ ..Default::default()
+ };
+ let r = extract(SIMPLE_ARTICLE, &opts).unwrap();
+ let decisions = r.decisions.expect("ledger present when opted in");
+
+ // First entry is the kept main container.
+ let root = &decisions[0];
+ assert!(root.kept, "first decision should be the kept root");
+ assert!(!root.selector.is_empty());
+
+ // The related-stories sidebar and footer are dropped — at least one
+ // kept=false entry, each carrying a non-empty selector.
+ let drops: Vec<_> = decisions.iter().filter(|d| !d.kept).collect();
+ assert!(
+ !drops.is_empty(),
+ "expected at least one dropped boilerplate block, got {decisions:?}"
+ );
+ for d in &drops {
+ assert!(!d.selector.is_empty(), "drop selector should be non-empty");
+ assert!((0.0..=1.0).contains(&d.confidence));
+ assert!((0.0..=1.0).contains(&d.score));
+ }
+}
+
#[test]
fn no_panic_on_malformed_input() {
// unclosed tags, weird structure