diff --git a/crates/html-extractor-napi/__test__/binding.test.mjs b/crates/html-extractor-napi/__test__/binding.test.mjs index d76a011..9ab7b78 100644 --- a/crates/html-extractor-napi/__test__/binding.test.mjs +++ b/crates/html-extractor-napi/__test__/binding.test.mjs @@ -65,6 +65,30 @@ test('conflicting options reject the promise', async () => { ) }) +test('outputDecisions returns a keep/drop ledger', async () => { + const withChrome = ` + +
+

Hello World

+

This is the first paragraph of an article, long enough to clear the extraction threshold with real prose preserved in the output.

+

A second paragraph gives the scored walk a solid main-content region to lock onto for this page.

+ +
+ + ` + const r = await extract(withChrome, { outputDecisions: true }) + assert.ok(Array.isArray(r.decisions), 'decisions should be an array') + assert.ok(r.decisions.length >= 1) + assert.equal(r.decisions[0].kept, true, 'first entry is the kept root') + for (const d of r.decisions) { + assert.equal(typeof d.selector, 'string') + assert.ok(d.confidence >= 0 && d.confidence <= 1) + } + // default: no ledger + const plain = await extract(withChrome) + assert.ok(!plain.decisions, 'decisions omitted unless opted in') +}) + test('version() returns a semver-shaped string', () => { const v = version() assert.match(v, /^\d+\.\d+\.\d+/) diff --git a/crates/html-extractor-napi/index.d.ts b/crates/html-extractor-napi/index.d.ts index 9e04c75..03272ec 100644 --- a/crates/html-extractor-napi/index.d.ts +++ b/crates/html-extractor-napi/index.d.ts @@ -10,7 +10,10 @@ export interface ExtractOptions { favorRecall?: boolean /** Include a plain-text mirror of the markdown in the result. */ outputText?: boolean - /** Reserved for Phase 4 (per-element decisions ledger). */ + /** + * Populate `result.decisions`: the kept main container plus every + * boilerplate block dropped during post-clean. Off by default. + */ outputDecisions?: boolean /** Hint for language detection. */ targetLanguage?: string @@ -62,6 +65,17 @@ export interface ExtractStats { pageType: string } +export interface Decision { + /** CSS-selector-shaped signature: tag + sorted classes + `#id`. */ + selector: string + /** Fraction of the kept subtree's text this element held, `[0,1]`. */ + score: number + /** Whether the element survived into the output. */ + kept: boolean + /** Confidence in the keep/drop call, `[0,1]`. */ + confidence: number +} + export interface ExtractResult { /** Cleaned main content as GitHub-flavored markdown. */ markdown: string @@ -77,6 +91,8 @@ export interface ExtractResult { metadata?: Metadata /** Internal stats useful for telemetry. */ stats?: ExtractStats + /** Keep/drop ledger when `outputDecisions: true`: kept root then dropped blocks. */ + decisions?: Decision[] /** Reason for a low-confidence / failed extraction, if any. */ errorReason?: string } diff --git a/crates/html-extractor-napi/src/lib.rs b/crates/html-extractor-napi/src/lib.rs index 9339ce4..0937ba6 100644 --- a/crates/html-extractor-napi/src/lib.rs +++ b/crates/html-extractor-napi/src/lib.rs @@ -52,6 +52,14 @@ pub struct ExtractStats { pub page_type: String, } +#[napi(object)] +pub struct Decision { + pub selector: String, + pub score: f64, + pub kept: bool, + pub confidence: f64, +} + #[napi(object)] pub struct ExtractResult { pub markdown: String, @@ -61,6 +69,7 @@ pub struct ExtractResult { pub language: Option, pub metadata: Option, pub stats: Option, + pub decisions: Option>, pub error_reason: Option, } @@ -141,6 +150,16 @@ fn map_result(r: html_extractor::ExtractResult, want_text: bool) -> ExtractResul used_fallback: s.used_fallback, page_type: s.page_type.to_string(), }); + let decisions = r.decisions.map(|ds| { + ds.into_iter() + .map(|d| Decision { + selector: d.selector, + score: d.score as f64, + kept: d.kept, + confidence: d.confidence as f64, + }) + .collect() + }); ExtractResult { markdown: r.markdown, text, @@ -149,6 +168,7 @@ fn map_result(r: html_extractor::ExtractResult, want_text: bool) -> ExtractResul language: r.language, metadata, stats, + decisions, error_reason: r.error_reason.map(|e| e.to_string()), } } @@ -229,6 +249,7 @@ fn too_large_result(html_len: usize, limit: usize) -> ExtractResult { language: None, metadata: None, stats: None, + decisions: None, error_reason: Some(format!( "input_too_large: {html_len} bytes exceeds max_input_size {limit}" )), diff --git a/crates/html-extractor/src/clean.rs b/crates/html-extractor/src/clean.rs index e2d5d60..87489d5 100644 --- a/crates/html-extractor/src/clean.rs +++ b/crates/html-extractor/src/clean.rs @@ -11,7 +11,7 @@ use once_cell::sync::Lazy; use regex::Regex; use crate::tree::Tree; -use crate::types::ExtractOptions; +use crate::types::{Decision, ExtractOptions}; /// Tags whose entire subtree is never content. const KILL_TAGS: &[&str] = &[ @@ -131,6 +131,8 @@ fn post_clean_inner( // descendant indices that the renderer will respect. This keeps `tree` // shareable with other passes (e.g. for fallback retries). let mut skip: std::collections::HashSet = std::collections::HashSet::new(); + // Only allocated when the caller asked for the decisions ledger. + let mut decisions: Vec = Vec::new(); // Subtree text/link-char metrics for the whole selection, computed in one // post-order pass. Looking these up is O(1); the previous code called // `full_text` per node, which re-walked each subtree (O(N²)). @@ -182,6 +184,30 @@ fn post_clean_inner( { skip.insert(idx); stripped_total += idx_text; + if options.output_decisions { + let link_density = if idx_text > 0 { + (metrics.link_chars[idx] as f32 / idx_text as f32).clamp(0.0, 1.0) + } else { + 0.0 + }; + decisions.push(Decision { + selector: elem.selector(), + // Share of the kept subtree's text this dropped block held. + score: if root_text_len > 0 { + idx_text as f32 / root_text_len as f32 + } else { + 0.0 + }, + kept: false, + // An explicit chrome/share class match is a strong signal; + // a link-density-only drop is only as sure as the ratio. + confidence: if chrome_hit || share_hit { + 0.9 + } else { + link_density + }, + }); + } // don't descend into a dropped subtree continue; } @@ -189,13 +215,19 @@ fn post_clean_inner( stack.push(c); } } - CleanedRoot { root, skip } + CleanedRoot { + root, + skip, + decisions, + } } -/// Result of post-clean: a root index plus a set of subtrees to skip. +/// Result of post-clean: a root index, the set of subtrees to skip, and (when +/// `output_decisions` is set) the per-drop decisions recorded along the way. pub(crate) struct CleanedRoot { pub root: usize, pub skip: std::collections::HashSet, + pub decisions: Vec, } static CHROME_RE: Lazy = Lazy::new(|| { diff --git a/crates/html-extractor/src/lib.rs b/crates/html-extractor/src/lib.rs index 9d85417..f1769a1 100644 --- a/crates/html-extractor/src/lib.rs +++ b/crates/html-extractor/src/lib.rs @@ -16,7 +16,9 @@ mod scoring; mod tree; mod types; -pub use types::{ExtractError, ExtractOptions, ExtractResult, ExtractStats, Metadata, PageType}; +pub use types::{ + Decision, ExtractError, ExtractOptions, ExtractResult, ExtractStats, Metadata, PageType, +}; /// Extract the main content of an HTML document, returning a structured /// [`ExtractResult`] containing markdown, page-type, metadata, and a confidence @@ -150,15 +152,30 @@ pub fn extract(html: &str, options: &ExtractOptions) -> Result Result String { + let mut s = self.tag.to_string(); + if let Some(class) = self.attr("class") { + let mut classes: Vec<&str> = class.split_whitespace().collect(); + classes.sort_unstable(); + for c in classes { + s.push('.'); + s.push_str(c); + } + } + if let Some(id) = self.attr("id") { + if !id.trim().is_empty() { + s.push('#'); + s.push_str(id.trim()); + } + } + s + } + pub fn attr(&self, name: &str) -> Option<&str> { self.attrs .iter() diff --git a/crates/html-extractor/src/types.rs b/crates/html-extractor/src/types.rs index 7795bf8..bfc10da 100644 --- a/crates/html-extractor/src/types.rs +++ b/crates/html-extractor/src/types.rs @@ -63,8 +63,9 @@ pub struct ExtractOptions { pub favor_recall: bool, /// Include the plain-text version in the result. pub output_text: bool, - /// Include the per-element decisions ledger (currently always `None`; see - /// `DECISIONS.md` D10). + /// Populate the per-element [`ExtractResult::decisions`] ledger: the + /// kept main container plus every boilerplate block post-clean dropped + /// inside it. Off by default — building it costs an allocation per drop. pub output_decisions: bool, /// Hint for language detection. pub target_language: Option, @@ -116,7 +117,9 @@ pub struct ExtractResult { pub language: Option, /// Metadata pulled from JSON-LD, OpenGraph, etc. pub metadata: Option, - /// Per-element ledger (Phase 4; presently always `None`). + /// Per-element keep/drop ledger. `None` unless + /// [`ExtractOptions::output_decisions`] was set; otherwise the kept main + /// container followed by each boilerplate block post-clean dropped. pub decisions: Option>, /// Stats describing what happened internally. pub stats: Option, @@ -166,16 +169,22 @@ pub struct Metadata { pub schema_type: Option, } -/// Per-element decision (Phase 4 stub). +/// A single keep/drop decision recorded during extraction, for telemetry and +/// for the offline rule-learner to mine boilerplate-container signatures. #[derive(Debug, Clone)] pub struct Decision { - /// Selector identifying the element. + /// CSS-selector-shaped signature: `tag` + sorted `.class`es + `#id` + /// (e.g. `div.related.sidebar#aside`). Stable enough to aggregate across + /// pages of the same template. pub selector: String, - /// Raw score. + /// Fraction of the kept subtree's text contained in this element, `[0, 1]`. + /// Near-zero for a small dropped widget; ~1.0 for the kept root. pub score: f32, - /// `true` if the element was kept in the output. + /// Whether the element survived into the output. `true` for the main + /// container, `false` for each dropped boilerplate block. pub kept: bool, - /// Confidence in `[0.0, 1.0]`. + /// Confidence in the keep/drop call, `[0, 1]`. High for explicit + /// chrome/share class matches; the link density for link-dense drops. pub confidence: f32, } diff --git a/crates/html-extractor/tests/integration_basic.rs b/crates/html-extractor/tests/integration_basic.rs index b47840b..1269ce6 100644 --- a/crates/html-extractor/tests/integration_basic.rs +++ b/crates/html-extractor/tests/integration_basic.rs @@ -86,6 +86,40 @@ fn page_type_override_is_respected() { assert_eq!(r.page_type, PageType::Documentation); } +#[test] +fn decisions_ledger_off_by_default() { + let r = extract(SIMPLE_ARTICLE, &ExtractOptions::default()).unwrap(); + assert!(r.decisions.is_none(), "ledger must be opt-in"); +} + +#[test] +fn decisions_ledger_records_kept_root_and_dropped_chrome() { + let opts = ExtractOptions { + output_decisions: true, + ..Default::default() + }; + let r = extract(SIMPLE_ARTICLE, &opts).unwrap(); + let decisions = r.decisions.expect("ledger present when opted in"); + + // First entry is the kept main container. + let root = &decisions[0]; + assert!(root.kept, "first decision should be the kept root"); + assert!(!root.selector.is_empty()); + + // The related-stories sidebar and footer are dropped — at least one + // kept=false entry, each carrying a non-empty selector. + let drops: Vec<_> = decisions.iter().filter(|d| !d.kept).collect(); + assert!( + !drops.is_empty(), + "expected at least one dropped boilerplate block, got {decisions:?}" + ); + for d in &drops { + assert!(!d.selector.is_empty(), "drop selector should be non-empty"); + assert!((0.0..=1.0).contains(&d.confidence)); + assert!((0.0..=1.0).contains(&d.score)); + } +} + #[test] fn no_panic_on_malformed_input() { // unclosed tags, weird structure