···153153 assert_eq!(config.strip.len(), 2);
154154 assert_eq!(config.strip_id_or_class.len(), 2);
155155 }
/// A `prune` value that is not a recognised boolean must be rejected with a
/// `ConfigError` carrying the offending value in its message.
#[test]
fn test_parse_invalid_boolean() {
    let outcome = parse_config("prune: perhaps");
    assert!(outcome.is_err());

    // Any other error variant means the parser failed for the wrong reason.
    let Error::ConfigError(message) = outcome.unwrap_err() else {
        panic!("Expected ConfigError");
    };
    assert_eq!(message, "Invalid boolean value: perhaps");
}
/// A line without a `key: value` shape must not make parsing fail; the valid
/// `title` entry next to it still has to be recorded.
#[test]
fn test_parse_malformed_lines() {
    let input = r#"
title: //h1
malformed line here
another: valid
    "#;

    let parsed = parse_config(input).unwrap();
    let title_count = parsed.title.len();
    assert_eq!(title_count, 1);
}
156177}
+69
crates/readability/src/extractor/generic.rs
···561561562562 assert!(result.body_html.contains("main article content"));
563563 }
/// `extract_body_simple` must yield a body containing the text of a short
/// `article-content` div.
#[test]
fn test_extract_body_simple_fallback() {
    let markup = r#"
        <html>
            <body>
                <div class="article-content">
                    Short content.
                </div>
            </body>
        </html>
    "#;

    let parsed = Html::parse_document(markup);
    let generic = GenericExtractor::new(markup.to_string());

    let body = generic.extract_body_simple(&parsed);
    assert!(body.is_some());

    let text = body.unwrap();
    assert!(text.contains("Short content"));
}
/// When the only title source in the page is the `<title>` tag, `extract_title`
/// must return its text.
#[test]
fn test_extract_title_fallback_tag() {
    let markup = r#"
        <html>
            <head>
                <title>Fallback Title</title>
            </head>
            <body></body>
        </html>
    "#;

    let parsed = Html::parse_document(markup);
    let generic = GenericExtractor::new(markup.to_string());

    let title = generic.extract_title(&parsed);
    assert_eq!(title.as_deref(), Some("Fallback Title"));
}
/// A `<time>` element's `datetime` attribute (not its visible text) must be
/// returned by `extract_date`.
#[test]
fn test_extract_date_fallback_time_element() {
    let markup = r#"
        <html>
            <body>
                <time datetime="2025-12-25">Christmas 2025</time>
            </body>
        </html>
    "#;

    let parsed = Html::parse_document(markup);
    let generic = GenericExtractor::new(markup.to_string());

    let date = generic.extract_date(&parsed);
    assert_eq!(date.as_deref(), Some("2025-12-25"));
}
/// An element marked `itemprop="datePublished"` must contribute its `content`
/// attribute (not its visible text) as the extracted date.
#[test]
fn test_extract_date_fallback_schema() {
    let markup = r#"
        <html>
            <body>
                <span itemprop="datePublished" content="2025-01-01">Jan 1st</span>
            </body>
        </html>
    "#;

    let parsed = Html::parse_document(markup);
    let generic = GenericExtractor::new(markup.to_string());

    let date = generic.extract_date(&parsed);
    assert_eq!(date.as_deref(), Some("2025-01-01"));
}
564633}
+22-4
crates/readability/src/extractor/scoring.rs
···237237 let document = Html::parse_fragment(html);
238238 let selector = Selector::parse("div").unwrap();
239239 let element = document.select(&selector).next().unwrap();
240240-241240 let weight = calculate_class_weight(element);
242241 assert!(weight > 0.0, "Should have positive weight for content/article classes");
243242 }
···248247 let document = Html::parse_fragment(html);
249248 let selector = Selector::parse("div").unwrap();
250249 let element = document.select(&selector).next().unwrap();
251251-252250 let weight = calculate_class_weight(element);
253251 assert!(weight < 0.0, "Should have negative weight for sidebar/comment classes");
254252 }
···259257 let document = Html::parse_fragment(html);
260258 let selector = Selector::parse("div").unwrap();
261259 let element = document.select(&selector).next().unwrap();
262262-263260 let density = calculate_link_density(element);
264261 assert!(density > 0.0 && density < 1.0, "Link density should be between 0 and 1");
265262 }
···270267 let document = Html::parse_fragment(html);
271268 let selector = Selector::parse("div").unwrap();
272269 let element = document.select(&selector).next().unwrap();
273273-274270 let density = calculate_link_density(element);
275271 assert!(
276272 density > 0.8,
···332328333329 let score = calculate_tag_score(element);
334330 assert_eq!(score, -5.0, "Nav tag should score -5");
331331+ }
/// An element whose class list mixes `sidebar` with `article-content` must not
/// be flagged as an unlikely candidate.
#[test]
fn test_mixed_signals() {
    let fragment = Html::parse_fragment(r#"<div class="sidebar article-content">Content</div>"#);
    let div_selector = Selector::parse("div").unwrap();
    let div = fragment.select(&div_selector).next().unwrap();

    let flagged_unlikely = is_unlikely_candidate(div);
    assert!(
        !flagged_unlikely,
        "Mixed signals with positive pattern should be valid"
    );
}
/// An empty `<div>` must report a link density of exactly zero.
#[test]
fn test_empty_link_density() {
    let fragment = Html::parse_fragment(r#"<div></div>"#);
    let div_selector = Selector::parse("div").unwrap();
    let empty_div = fragment.select(&div_selector).next().unwrap();

    let density = calculate_link_density(empty_div);
    assert_eq!(density, 0.0);
}
336354}
+33-3
crates/readability/src/extractor/xpath.rs
···522522 assert!(body.contains("Main content here"));
523523 assert!(body.contains("Section Title"));
524524 }
/// After strip rules are applied, void elements (`<br>`, `<img>`) must still be
/// present without closing tags, and the node matched by the strip rule must
/// be gone.
#[test]
fn test_rebuild_void_elements() {
    let markup = r#"
        <html>
            <body>
                <p>Text <br> with break</p>
                <img src="test.jpg">
                <div id="remove">Remove me</div>
            </body>
        </html>
    "#;

    let xpath_extractor = XPathExtractor::new(markup.to_string());
    let site_config = SiteConfig {
        strip: vec![String::from("//*[@id='remove']")],
        ..SiteConfig::default()
    };

    let stripped = xpath_extractor.apply_strip_rules(markup, &site_config).unwrap();

    // Void elements are kept as-is and never gain a closing tag.
    assert!(stripped.contains("<br>"));
    assert!(!stripped.contains("</br>"));
    assert!(stripped.contains("<img src=\"test.jpg\">"));
    assert!(!stripped.contains("</img>"));

    // The element targeted by the strip rule is removed.
    assert!(!stripped.contains("Remove me"));
}
/// An expression using the `following-sibling` axis is expected to be rejected
/// with `Error::XPathError`.
#[test]
fn test_unsupported_xpath() {
    let markup = "<html></html>";
    let parsed = Html::parse_document(markup);
    let xpath_extractor = XPathExtractor::new(markup.to_string());

    // TODO: implement complex axis navigation
    let outcome = xpath_extractor.evaluate_xpath(&parsed, "//div/following-sibling::p", false);
    assert!(matches!(outcome, Err(Error::XPathError(_))));
}
525558}
526559527560#[test]
528561fn test_wikipedia_xpath_patterns() {
529562 let extractor = XPathExtractor::new(String::new());
530530-531531- // Wikipedia title XPath
532563 let (css, filter) = extractor.xpath_to_css_with_attr("//h1[@id='firstHeading']").unwrap();
533564 assert_eq!(css, "h1#firstHeading");
534565 assert!(filter.is_none());
535566536536- // Wikipedia body XPath (note space around =)
537567 let (css, filter) = extractor.xpath_to_css_with_attr("//div[@id = 'bodyContent']").unwrap();
538568 assert_eq!(css, "div#bodyContent");
539569 assert!(filter.is_none());
+83-49
crates/readability/tests/readability_tests.rs
···11use malfestio_readability::Readability;
22+use std::fs;
33+use std::path::PathBuf;
2433-#[tokio::test]
44-#[ignore = "requires network access"]
55-async fn test_arxiv_extraction() {
66- let url = "https://arxiv.org/abs/2009.03017";
55+fn get_test_html(filename: &str) -> Option<String> {
66+ let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
77+ path.push("tests/data");
88+ path.push(filename);
7988- let client = reqwest::Client::builder()
99- .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)")
1010- .build()
1111- .unwrap();
1010+ if path.exists() {
1111+ Some(fs::read_to_string(path).unwrap())
1212+ } else {
1313+ println!("Test data file not found: {:?}. Skipping test.", path);
1414+ None
1515+ }
1616+}
12171313- let response = client.get(url).send().await.unwrap();
1414- let html = response.text().await.unwrap();
/// Extraction from the saved arXiv abstract page: title, body, author and
/// publication date. Returns early when the fixture is missing.
#[test]
fn test_arxiv_extraction() {
    let Some(page) = get_test_html("arxiv.html") else {
        return;
    };
    let source_url = "https://arxiv.org/abs/2009.03017";

    let reader = Readability::new(page, Some(source_url));
    let article = reader.parse().unwrap();

    // Title
    assert!(!article.title.is_empty(), "Title should be extracted");
    assert!(article.title.contains("Non-exponentially weighted aggregation"));

    // Body
    assert!(!article.markdown.is_empty(), "Body/markdown should be extracted");
    assert!(article.markdown.len() > 50, "Abstract should have substantial content");

    // Arxiv meta tag uses "Lastname, Firstname" format:
    // <meta name="citation_author" content="Alquier, Pierre" />
    assert_eq!(article.author.as_deref(), Some("Alquier, Pierre"));
    assert_eq!(article.published_date.as_deref(), Some("2020/09/07"));
}
35393636-#[tokio::test]
3737-#[ignore = "requires network access"]
3838-async fn test_wikipedia_extraction() {
/// Extraction from the saved Wikipedia article: title, substantial body, and
/// removal of edit-section markup. Returns early when the fixture is missing.
#[test]
fn test_wikipedia_extraction() {
    let Some(page) = get_test_html("wikipedia.html") else {
        return;
    };
    let source_url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";

    let reader = Readability::new(page, Some(source_url));
    let article = reader.parse().unwrap();

    assert!(article.title.contains("Rust"), "Title should contain 'Rust'");

    let markdown_len = article.markdown.len();
    assert!(
        markdown_len > 1000,
        "Wikipedia article should have substantial content"
    );

    // Edit-section markup must not survive in the extracted content.
    let has_edit_sections = article.content.contains("mw-editsection");
    assert!(
        !has_edit_sections,
        "Edit section elements (mw-editsection) should be stripped"
    );
}
67626868-/// Test extraction for site without specific rules (falls back to generic)
6969-#[tokio::test]
7070-#[ignore = "requires network access"]
7171-async fn test_generic_fallback_extraction() {
/// Extraction from the saved rust-lang.org page must produce a non-empty title
/// and body. Returns early when the fixture is missing.
#[test]
fn test_generic_fallback_extraction() {
    let Some(page) = get_test_html("generic.html") else {
        return;
    };
    let source_url = "https://www.rust-lang.org/";

    let reader = Readability::new(page, Some(source_url));
    let article = reader.parse().unwrap();

    assert!(!article.title.is_empty(), "Title should be extracted via generic");
    assert!(!article.markdown.is_empty(), "Content should be extracted via generic");
}
87778888- println!("Title: {}", article.title);
8989- println!("Markdown length: {} chars", article.markdown.len());
/// Extraction from the saved Substack post must produce a title mentioning
/// "Glenn Greenwald". Returns early when the fixture is missing.
#[test]
fn test_substack_extraction() {
    let Some(page) = get_test_html("substack.html") else {
        return;
    };
    let source_url = "https://taibbi.substack.com/p/glenn-greenwald-on-his-resignation";

    let reader = Readability::new(page, Some(source_url));
    let article = reader.parse().unwrap();

    assert!(!article.title.is_empty(), "Title should be extracted");

    let mentions_subject = article.title.contains("Glenn Greenwald");
    assert!(mentions_subject, "Title should match expectation");
}
/// Extraction from the saved Onion article must produce a non-empty title.
/// Returns early when the fixture is missing.
#[test]
fn test_theonion_extraction() {
    let Some(page) = get_test_html("theonion.html") else {
        return;
    };
    let source_url =
        "https://www.theonion.com/theresa-may-narrowly-manages-to-survive-parliamentary-f-1831077604";

    let reader = Readability::new(page, Some(source_url));
    let article = reader.parse().unwrap();

    assert!(!article.title.is_empty(), "Title should be extracted");
    // The onion uses JSON-LD or meta tags usually, check if our rules caught it
    // TODO: we should implement JSON-LD support
}
/// Extraction from the saved Read the Docs page must produce a non-empty
/// title. Returns early when the fixture is missing.
#[test]
fn test_readthedocs_extraction() {
    let Some(page) = get_test_html("readthedocs.html") else {
        return;
    };
    let source_url = "http://docs.readthedocs.io/en/latest/getting_started.html";

    let reader = Readability::new(page, Some(source_url));
    let article = reader.parse().unwrap();

    assert!(!article.title.is_empty(), "Title should be extracted");
}
+8
justfile
···1212start:
1313 cargo run --bin malfestio-cli start
14141515+# Fetch test data for readability tests
1616+fetch-test-data:
1717+ ./scripts/fetch_test_data.sh
1818+1919+# Run readability tests (fetches data first)
2020+test-readability: fetch-test-data
2121+ cargo test -p malfestio-readability --test readability_tests
2222+1523# Run all tests
1624test:
1725 cargo test --quiet
+24
scripts/fetch_test_data.sh
#!/bin/bash
# Download the HTML fixtures used by the readability integration tests.
#
# Fixtures are written to crates/readability/tests/data/. Files that already
# exist are left untouched, so re-running only fetches what is missing. A failed
# download is reported on stderr but does not stop the remaining fetches.

# All paths below are relative to the repository root (this script lives in
# scripts/), so switch there regardless of the caller's working directory.
cd "$(dirname "${BASH_SOURCE[0]}")/.." || exit 1

mkdir -p crates/readability/tests/data

# fetch_if_missing URL OUTPUT
#
# Downloads URL to OUTPUT unless OUTPUT already exists. The download goes to a
# temporary file first and is only moved into place on success, so an HTTP
# error page or an interrupted transfer is never cached as a fixture.
# Returns 0 when the file is cached or fetched, 1 when the download failed.
fetch_if_missing() {
    local url="$1"
    local output="$2"
    local partial="${output}.part"

    if [ -f "$output" ]; then
        echo "Cached: $output"
        return 0
    fi

    echo "Fetching $url..."
    # --fail: exit non-zero on HTTP errors (4xx/5xx) instead of saving the error body.
    if curl --fail -L -H "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" -o "$partial" "$url"; then
        mv "$partial" "$output"
    else
        rm -f "$partial"
        echo "Failed to fetch $url" >&2
        return 1
    fi
}

fetch_if_missing "https://arxiv.org/abs/2009.03017" "crates/readability/tests/data/arxiv.html"
fetch_if_missing "https://en.wikipedia.org/wiki/Rust_(programming_language)" "crates/readability/tests/data/wikipedia.html"
fetch_if_missing "https://dougshapiro.medium.com/how-will-the-disruption-of-hollywood-play-out-42f724c921e1" "crates/readability/tests/data/medium.html"
fetch_if_missing "https://taibbi.substack.com/p/glenn-greenwald-on-his-resignation" "crates/readability/tests/data/substack.html"
fetch_if_missing "https://www.theonion.com/theresa-may-narrowly-manages-to-survive-parliamentary-f-1831077604" "crates/readability/tests/data/theonion.html"
fetch_if_missing "http://docs.readthedocs.io/en/latest/getting_started.html" "crates/readability/tests/data/readthedocs.html"
fetch_if_missing "https://www.rust-lang.org/" "crates/readability/tests/data/generic.html"

echo "Done."