Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions crates/html-extractor-napi/__test__/binding.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,30 @@ test('conflicting options reject the promise', async () => {
)
})

// Opting in with `outputDecisions: true` must surface the keep/drop ledger on
// the result, and every entry must honour the `Decision` contract; without the
// option the field stays absent.
test('outputDecisions returns a keep/drop ledger', async () => {
  // Article fixture with a related-stories <aside> inside the article and a
  // site footer outside <main>.
  const withChrome = `
<html><body>
<main><article>
<h1>Hello World</h1>
<p>This is the first paragraph of an article, long enough to clear the extraction threshold with real prose preserved in the output.</p>
<p>A second paragraph gives the scored walk a solid main-content region to lock onto for this page.</p>
<aside class="related-stories"><a href="/a">Other story one</a><a href="/b">Other story two</a></aside>
</article></main>
<footer class="site-footer">© 2024 ExampleSite</footer>
</body></html>`
  const r = await extract(withChrome, { outputDecisions: true })
  assert.ok(Array.isArray(r.decisions), 'decisions should be an array')
  assert.ok(r.decisions.length >= 1)
  assert.equal(r.decisions[0].kept, true, 'first entry is the kept root')
  for (const d of r.decisions) {
    assert.equal(typeof d.selector, 'string')
    assert.equal(typeof d.kept, 'boolean')
    // `score` is documented as a [0,1] fraction, same as `confidence`.
    assert.ok(d.score >= 0 && d.score <= 1, 'score is a [0,1] fraction')
    assert.ok(d.confidence >= 0 && d.confidence <= 1)
  }
  // default: no ledger
  const plain = await extract(withChrome)
  assert.ok(!plain.decisions, 'decisions omitted unless opted in')
})

test('version() returns a semver-shaped string', () => {
const v = version()
assert.match(v, /^\d+\.\d+\.\d+/)
Expand Down
18 changes: 17 additions & 1 deletion crates/html-extractor-napi/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@ export interface ExtractOptions {
favorRecall?: boolean
/** Include a plain-text mirror of the markdown in the result. */
outputText?: boolean
/** Reserved for Phase 4 (per-element decisions ledger). */
/**
* Populate `result.decisions`: the kept main container plus every
* boilerplate block dropped during post-clean. Off by default.
*/
outputDecisions?: boolean
/** Hint for language detection. */
targetLanguage?: string
Expand Down Expand Up @@ -62,6 +65,17 @@ export interface ExtractStats {
pageType: string
}

/**
 * One entry of the keep/drop ledger returned in `ExtractResult.decisions`
 * when `outputDecisions: true` is passed.
 */
export interface Decision {
/** CSS-selector-shaped signature: tag + sorted `.class`es + `#id`. */
selector: string
/**
 * Fraction of the kept subtree's text this element held, `[0,1]`.
 * Always `1` for the kept root entry.
 */
score: number
/** Whether the element survived into the output (`true` only for the kept root). */
kept: boolean
/**
 * Confidence in the keep/drop call, `[0,1]`. For dropped blocks: high for an
 * explicit chrome/share class match, otherwise the block's link density.
 */
confidence: number
}

export interface ExtractResult {
/** Cleaned main content as GitHub-flavored markdown. */
markdown: string
Expand All @@ -77,6 +91,8 @@ export interface ExtractResult {
metadata?: Metadata
/** Internal stats useful for telemetry. */
stats?: ExtractStats
/** Keep/drop ledger when `outputDecisions: true`: kept root then dropped blocks. */
decisions?: Decision[]
/** Reason for a low-confidence / failed extraction, if any. */
errorReason?: string
}
Expand Down
21 changes: 21 additions & 0 deletions crates/html-extractor-napi/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ pub struct ExtractStats {
pub page_type: String,
}

/// JS-facing mirror of the core crate's `Decision`: one keep/drop ledger
/// entry. The numeric fields are `f64` here; the core values are widened
/// with `as f64` when the result is mapped.
#[napi(object)]
pub struct Decision {
/// CSS-selector-shaped signature of the element.
pub selector: String,
/// Fraction of the kept subtree's text this element held.
pub score: f64,
/// Whether the element survived into the output.
pub kept: bool,
/// Confidence in the keep/drop call.
pub confidence: f64,
}

#[napi(object)]
pub struct ExtractResult {
pub markdown: String,
Expand All @@ -61,6 +69,7 @@ pub struct ExtractResult {
pub language: Option<String>,
pub metadata: Option<Metadata>,
pub stats: Option<ExtractStats>,
pub decisions: Option<Vec<Decision>>,
pub error_reason: Option<String>,
}

Expand Down Expand Up @@ -141,6 +150,16 @@ fn map_result(r: html_extractor::ExtractResult, want_text: bool) -> ExtractResul
used_fallback: s.used_fallback,
page_type: s.page_type.to_string(),
});
let decisions = r.decisions.map(|ds| {
ds.into_iter()
.map(|d| Decision {
selector: d.selector,
score: d.score as f64,
kept: d.kept,
confidence: d.confidence as f64,
})
.collect()
});
ExtractResult {
markdown: r.markdown,
text,
Expand All @@ -149,6 +168,7 @@ fn map_result(r: html_extractor::ExtractResult, want_text: bool) -> ExtractResul
language: r.language,
metadata,
stats,
decisions,
error_reason: r.error_reason.map(|e| e.to_string()),
}
}
Expand Down Expand Up @@ -229,6 +249,7 @@ fn too_large_result(html_len: usize, limit: usize) -> ExtractResult {
language: None,
metadata: None,
stats: None,
decisions: None,
error_reason: Some(format!(
"input_too_large: {html_len} bytes exceeds max_input_size {limit}"
)),
Expand Down
38 changes: 35 additions & 3 deletions crates/html-extractor/src/clean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
use regex::Regex;

use crate::tree::Tree;
use crate::types::ExtractOptions;
use crate::types::{Decision, ExtractOptions};

/// Tags whose entire subtree is never content.
const KILL_TAGS: &[&str] = &[
Expand Down Expand Up @@ -131,6 +131,8 @@ fn post_clean_inner(
// descendant indices that the renderer will respect. This keeps `tree`
// shareable with other passes (e.g. for fallback retries).
let mut skip: std::collections::HashSet<usize> = std::collections::HashSet::new();
// Only allocated when the caller asked for the decisions ledger.
let mut decisions: Vec<Decision> = Vec::new();
// Subtree text/link-char metrics for the whole selection, computed in one
// post-order pass. Looking these up is O(1); the previous code called
// `full_text` per node, which re-walked each subtree (O(N²)).
Expand Down Expand Up @@ -182,20 +184,50 @@ fn post_clean_inner(
{
skip.insert(idx);
stripped_total += idx_text;
if options.output_decisions {
let link_density = if idx_text > 0 {
(metrics.link_chars[idx] as f32 / idx_text as f32).clamp(0.0, 1.0)
} else {
0.0
};
decisions.push(Decision {
selector: elem.selector(),
// Share of the kept subtree's text this dropped block held.
score: if root_text_len > 0 {
idx_text as f32 / root_text_len as f32
} else {
0.0
},
kept: false,
// An explicit chrome/share class match is a strong signal;
// a link-density-only drop is only as sure as the ratio.
confidence: if chrome_hit || share_hit {
0.9
} else {
link_density
},
});
}
// don't descend into a dropped subtree
continue;
}
for &c in &elem.children {
stack.push(c);
}
}
CleanedRoot { root, skip }
CleanedRoot {
root,
skip,
decisions,
}
}

/// Result of post-clean: a root index plus a set of subtrees to skip.
/// Result of post-clean: a root index, the set of subtrees to skip, and (when
/// `output_decisions` is set) the per-drop decisions recorded along the way.
pub(crate) struct CleanedRoot {
pub root: usize,
pub skip: std::collections::HashSet<usize>,
pub decisions: Vec<Decision>,
}

static CHROME_RE: Lazy<Regex> = Lazy::new(|| {
Expand Down
27 changes: 22 additions & 5 deletions crates/html-extractor/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ mod scoring;
mod tree;
mod types;

pub use types::{ExtractError, ExtractOptions, ExtractResult, ExtractStats, Metadata, PageType};
pub use types::{
Decision, ExtractError, ExtractOptions, ExtractResult, ExtractStats, Metadata, PageType,
};

/// Extract the main content of an HTML document, returning a structured
/// [`ExtractResult`] containing markdown, page-type, metadata, and a confidence
Expand Down Expand Up @@ -150,15 +152,30 @@ pub fn extract(html: &str, options: &ExtractOptions) -> Result<ExtractResult, Ex
};

// Stage 5: post-clean within the kept subtree, then render.
let (markdown, text_chars) = if let Some(root) = final_root {
let (markdown, text_chars, decisions) = if let Some(root) = final_root {
let cleaned = if matches!(page_type, PageType::Listing | PageType::Collection) {
clean::post_clean_lenient_links(&tree, root, options)
} else {
clean::post_clean(&tree, root, options)
};
render::render(&tree, &cleaned, options)
let (markdown, text_chars) = render::render(&tree, &cleaned, options);
// Ledger: the kept main container followed by each dropped block.
let decisions = if options.output_decisions {
let mut v = Vec::with_capacity(cleaned.decisions.len() + 1);
v.push(Decision {
selector: tree.get(root).selector(),
score: 1.0,
kept: true,
confidence: quality,
});
v.extend(cleaned.decisions);
Some(v)
} else {
None
};
(markdown, text_chars, decisions)
} else {
(String::new(), 0)
(String::new(), 0, None)
};

let stats = ExtractStats {
Expand All @@ -180,7 +197,7 @@ pub fn extract(html: &str, options: &ExtractOptions) -> Result<ExtractResult, Ex
} else {
None
},
decisions: None,
decisions,
stats: Some(stats),
error_reason: None,
})
Expand Down
22 changes: 22 additions & 0 deletions crates/html-extractor/src/tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,28 @@ impl Element {
out.to_lowercase()
}

/// CSS-selector-shaped signature for this element: `tag` + sorted, de-duplicated
/// `.class`es + `#id` (e.g. `div.related.sidebar#aside`). Used by the
/// decisions ledger; deterministic so it aggregates across pages.
///
/// Class tokens are split on whitespace, sorted, and repeated tokens are
/// collapsed, so `class="nav  nav main"` and `class="main nav"` produce the
/// same signature. The `id` is trimmed and omitted when blank.
pub fn selector(&self) -> String {
    let mut s = self.tag.to_string();
    if let Some(class) = self.attr("class") {
        let mut classes: Vec<&str> = class.split_whitespace().collect();
        classes.sort_unstable();
        // A repeated class token names the same element signature as a single
        // one; without this, `nav nav` would aggregate separately from `nav`.
        classes.dedup();
        for c in classes {
            s.push('.');
            s.push_str(c);
        }
    }
    if let Some(id) = self.attr("id").map(str::trim).filter(|id| !id.is_empty()) {
        s.push('#');
        s.push_str(id);
    }
    s
}

pub fn attr(&self, name: &str) -> Option<&str> {
self.attrs
.iter()
Expand Down
25 changes: 17 additions & 8 deletions crates/html-extractor/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,9 @@ pub struct ExtractOptions {
pub favor_recall: bool,
/// Include the plain-text version in the result.
pub output_text: bool,
/// Include the per-element decisions ledger (currently always `None`; see
/// `DECISIONS.md` D10).
/// Populate the per-element [`ExtractResult::decisions`] ledger: the
/// kept main container plus every boilerplate block post-clean dropped
/// inside it. Off by default — building it costs an allocation per drop.
pub output_decisions: bool,
/// Hint for language detection.
pub target_language: Option<String>,
Expand Down Expand Up @@ -116,7 +117,9 @@ pub struct ExtractResult {
pub language: Option<String>,
/// Metadata pulled from JSON-LD, OpenGraph, etc.
pub metadata: Option<Metadata>,
/// Per-element ledger (Phase 4; presently always `None`).
/// Per-element keep/drop ledger. `None` unless
/// [`ExtractOptions::output_decisions`] was set; otherwise the kept main
/// container followed by each boilerplate block post-clean dropped.
pub decisions: Option<Vec<Decision>>,
/// Stats describing what happened internally.
pub stats: Option<ExtractStats>,
Expand Down Expand Up @@ -166,16 +169,22 @@ pub struct Metadata {
pub schema_type: Option<String>,
}

/// A single keep/drop decision recorded during extraction, for telemetry and
/// for the offline rule-learner to mine boilerplate-container signatures.
#[derive(Debug, Clone)]
pub struct Decision {
/// CSS-selector-shaped signature identifying the element: `tag` + sorted
/// `.class`es + `#id` (e.g. `div.related.sidebar#aside`). Stable enough to
/// aggregate across pages of the same template.
pub selector: String,
/// Fraction of the kept subtree's text contained in this element, `[0, 1]`.
/// Near-zero for a small dropped widget; ~1.0 for the kept root.
pub score: f32,
/// Whether the element survived into the output. `true` for the main
/// container, `false` for each dropped boilerplate block.
pub kept: bool,
/// Confidence in the keep/drop call, `[0, 1]`. High for explicit
/// chrome/share class matches; the link density for link-dense drops.
pub confidence: f32,
}

Expand Down
34 changes: 34 additions & 0 deletions crates/html-extractor/tests/integration_basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,40 @@ fn page_type_override_is_respected() {
assert_eq!(r.page_type, PageType::Documentation);
}

/// With default options the ledger must not be built at all.
#[test]
fn decisions_ledger_off_by_default() {
    let default_opts = ExtractOptions::default();
    let result = extract(SIMPLE_ARTICLE, &default_opts).unwrap();
    assert!(result.decisions.is_none(), "ledger must be opt-in");
}

/// Opting in yields the kept root first, followed by at least one dropped
/// boilerplate block whose fields are well-formed.
#[test]
fn decisions_ledger_records_kept_root_and_dropped_chrome() {
    let opts = ExtractOptions {
        output_decisions: true,
        ..Default::default()
    };
    let decisions = extract(SIMPLE_ARTICLE, &opts)
        .unwrap()
        .decisions
        .expect("ledger present when opted in");

    // The ledger leads with the kept main container.
    let kept_root = &decisions[0];
    assert!(kept_root.kept, "first decision should be the kept root");
    assert!(!kept_root.selector.is_empty());

    // Everything flagged kept=false is a dropped boilerplate block; there
    // must be at least one, and each needs a selector and in-range numbers.
    let mut dropped = decisions.iter().filter(|d| !d.kept).peekable();
    assert!(
        dropped.peek().is_some(),
        "expected at least one dropped boilerplate block, got {decisions:?}"
    );
    for d in dropped {
        assert!(!d.selector.is_empty(), "drop selector should be non-empty");
        assert!((0.0..=1.0).contains(&d.confidence));
        assert!((0.0..=1.0).contains(&d.score));
    }
}

#[test]
fn no_panic_on_malformed_input() {
// unclosed tags, weird structure
Expand Down
Loading