feat: add more rulesets · desertthunder.dev/malfestio@e8c749e

+3

.gitignore

··· 22 22 .env 23 23 24 24 .sandbox 25 + 26 + # Test data 27 + crates/readability/tests/data/

+3

crates/readability/rules/.readthedocs.io.txt

··· 1 + title: //h1 2 + body: //div[@role='main'] 3 + test_url: http://docs.readthedocs.io/en/latest/getting_started.html

+5

crates/readability/rules/.stanford.edu.txt

··· 1 + title: //div[@id='aueditable']/h1 2 + body: //div[@id='content'] 3 + strip: //div[@id='message' or @id='linklist'] 4 + prune: no 5 + test_url: http://plato.stanford.edu/entries/supervenience/

+25

crates/readability/rules/.substack.com.txt

··· 1 + author: //meta[@name="author"]/@content 2 + title: //meta[@property="og:title"]/@content 3 + body: //h3[contains(concat(' ',normalize-space(@class),' '),' subtitle ')] | //div[contains(concat(' ',normalize-space(@class),' '),' body ')] 4 + 5 + # Clean Twitter embeds 6 + strip: //div[contains(@class, 'tweet-footer')]//span 7 + strip_id_or_class: expanded-link-description 8 + strip_id_or_class: expanded-link-domain 9 + 10 + strip_id_or_class: header-anchor-widget 11 + strip_id_or_class: subscribe-widget 12 + 13 + strip: //button 14 + strip: //svg 15 + strip: //p[contains(concat(' ',normalize-space(@class),' '),' button-wrapper ')] 16 + 17 + wrap_in(blockquote): //div[@class='tweet'] 18 + 19 + 20 + prune: no 21 + 22 + test_url: https://taibbi.substack.com/p/glenn-greenwald-on-his-resignation 23 + test_contains: Greenwald, by then furious, noted that neither Maass nor Reed had identified a factual inaccuracy 24 + test_url: https://jonathancook.substack.com/p/why-the-western-media-is-afraid-of 25 + test_contains: The goal of the corporate media is not unearthing truth

+10

crates/readability/rules/.theonion.com.txt

··· 1 + title: //h2[@class='title'] | //h1[contains(concat(' ',normalize-space(@class),' '),'headline')] 2 + date: substring-before(//p[@class='meta'], '|') 3 + body: //div[@class='article_body'] | //div[@class='story'] | //div[contains(concat(' ',normalize-space(@class),' '),'post-content')] 4 + 5 + strip: //h2[@class='title'] 6 + strip: //p[@class='meta'] 7 + strip: //div[@class='ga_section'] 8 + strip: //div[@id='recent_slider'] 9 + 10 + test_url: https://politics.theonion.com/inconsolable-jeff-sessions-tries-to-commit-suicide-by-s-1826462420

+6

crates/readability/rules/theonion.com.txt

··· 1 + title: //head/title 2 + author: //meta[@name="author"]/@content 3 + body: //div[contains(@class, 'js_post-content')] 4 + strip: //div[contains(@class, 'content-summary')] 5 + 6 + test_url: https://www.theonion.com/theresa-may-narrowly-manages-to-survive-parliamentary-f-1831077604

+21

crates/readability/src/config/parser.rs

··· 153 153 assert_eq!(config.strip.len(), 2); 154 154 assert_eq!(config.strip_id_or_class.len(), 2); 155 155 } 156 + #[test] 157 + fn test_parse_invalid_boolean() { 158 + let content = "prune: perhaps"; 159 + let result = parse_config(content); 160 + assert!(result.is_err()); 161 + match result.unwrap_err() { 162 + Error::ConfigError(msg) => assert_eq!(msg, "Invalid boolean value: perhaps"), 163 + _ => panic!("Expected ConfigError"), 164 + } 165 + } 166 + 167 + #[test] 168 + fn test_parse_malformed_lines() { 169 + let content = r#" 170 + title: //h1 171 + malformed line here 172 + another: valid 173 + "#; 174 + let config = parse_config(content).unwrap(); 175 + assert_eq!(config.title.len(), 1); 176 + } 156 177 }

+69

crates/readability/src/extractor/generic.rs

··· 561 561 562 562 assert!(result.body_html.contains("main article content")); 563 563 } 564 + #[test] 565 + fn test_extract_body_simple_fallback() { 566 + let html = r#" 567 + <html> 568 + <body> 569 + <div class="article-content"> 570 + Short content. 571 + </div> 572 + </body> 573 + </html> 574 + "#; 575 + 576 + let extractor = GenericExtractor::new(html.to_string()); 577 + let document = Html::parse_document(html); 578 + let body = extractor.extract_body_simple(&document); 579 + 580 + assert!(body.is_some()); 581 + assert!(body.unwrap().contains("Short content")); 582 + } 583 + 584 + #[test] 585 + fn test_extract_title_fallback_tag() { 586 + let html = r#" 587 + <html> 588 + <head> 589 + <title>Fallback Title</title> 590 + </head> 591 + <body></body> 592 + </html> 593 + "#; 594 + 595 + let extractor = GenericExtractor::new(html.to_string()); 596 + let document = Html::parse_document(html); 597 + let title = extractor.extract_title(&document); 598 + 599 + assert_eq!(title, Some("Fallback Title".to_string())); 600 + } 601 + 602 + #[test] 603 + fn test_extract_date_fallback_time_element() { 604 + let html = r#" 605 + <html> 606 + <body> 607 + <time datetime="2025-12-25">Christmas 2025</time> 608 + </body> 609 + </html> 610 + "#; 611 + 612 + let extractor = GenericExtractor::new(html.to_string()); 613 + let document = Html::parse_document(html); 614 + let date = extractor.extract_date(&document); 615 + assert_eq!(date, Some("2025-12-25".to_string())); 616 + } 617 + 618 + #[test] 619 + fn test_extract_date_fallback_schema() { 620 + let html = r#" 621 + <html> 622 + <body> 623 + <span itemprop="datePublished" content="2025-01-01">Jan 1st</span> 624 + </body> 625 + </html> 626 + "#; 627 + 628 + let extractor = GenericExtractor::new(html.to_string()); 629 + let document = Html::parse_document(html); 630 + let date = extractor.extract_date(&document); 631 + assert_eq!(date, Some("2025-01-01".to_string())); 632 + } 564 633 }

+22 -4

crates/readability/src/extractor/scoring.rs

··· 237 237 let document = Html::parse_fragment(html); 238 238 let selector = Selector::parse("div").unwrap(); 239 239 let element = document.select(&selector).next().unwrap(); 240 - 241 240 let weight = calculate_class_weight(element); 242 241 assert!(weight > 0.0, "Should have positive weight for content/article classes"); 243 242 } ··· 248 247 let document = Html::parse_fragment(html); 249 248 let selector = Selector::parse("div").unwrap(); 250 249 let element = document.select(&selector).next().unwrap(); 251 - 252 250 let weight = calculate_class_weight(element); 253 251 assert!(weight < 0.0, "Should have negative weight for sidebar/comment classes"); 254 252 } ··· 259 257 let document = Html::parse_fragment(html); 260 258 let selector = Selector::parse("div").unwrap(); 261 259 let element = document.select(&selector).next().unwrap(); 262 - 263 260 let density = calculate_link_density(element); 264 261 assert!(density > 0.0 && density < 1.0, "Link density should be between 0 and 1"); 265 262 } ··· 270 267 let document = Html::parse_fragment(html); 271 268 let selector = Selector::parse("div").unwrap(); 272 269 let element = document.select(&selector).next().unwrap(); 273 - 274 270 let density = calculate_link_density(element); 275 271 assert!( 276 272 density > 0.8, ··· 332 328 333 329 let score = calculate_tag_score(element); 334 330 assert_eq!(score, -5.0, "Nav tag should score -5"); 331 + } 332 + #[test] 333 + fn test_mixed_signals() { 334 + let html = r#"<div class="sidebar article-content">Content</div>"#; 335 + let document = Html::parse_fragment(html); 336 + let selector = Selector::parse("div").unwrap(); 337 + let element = document.select(&selector).next().unwrap(); 338 + 339 + assert!( 340 + !is_unlikely_candidate(element), 341 + "Mixed signals with positive pattern should be valid" 342 + ); 343 + } 344 + 345 + #[test] 346 + fn test_empty_link_density() { 347 + let html = r#"<div></div>"#; 348 + let document = Html::parse_fragment(html); 349 + let 
selector = Selector::parse("div").unwrap(); 350 + let element = document.select(&selector).next().unwrap(); 351 + 352 + assert_eq!(calculate_link_density(element), 0.0); 335 353 } 336 354 }

+33 -3

crates/readability/src/extractor/xpath.rs

··· 522 522 assert!(body.contains("Main content here")); 523 523 assert!(body.contains("Section Title")); 524 524 } 525 + #[test] 526 + fn test_rebuild_void_elements() { 527 + let html = r#" 528 + <html> 529 + <body> 530 + <p>Text <br> with break</p> 531 + <img src="test.jpg"> 532 + <div id="remove">Remove me</div> 533 + </body> 534 + </html> 535 + "#; 536 + 537 + let config = SiteConfig { strip: vec!["//*[@id='remove']".to_string()], ..Default::default() }; 538 + let extractor = XPathExtractor::new(html.to_string()); 539 + let result = extractor.apply_strip_rules(html, &config).unwrap(); 540 + 541 + assert!(result.contains("<br>")); 542 + assert!(!result.contains("</br>")); 543 + assert!(result.contains("<img src=\"test.jpg\">")); 544 + assert!(!result.contains("</img>")); 545 + assert!(!result.contains("Remove me")); 546 + } 547 + 548 + #[test] 549 + fn test_unsupported_xpath() { 550 + let html = "<html></html>"; 551 + let extractor = XPathExtractor::new(html.to_string()); 552 + let document = Html::parse_document(html); 553 + 554 + // TODO: implement complex axis navigation 555 + let result = extractor.evaluate_xpath(&document, "//div/following-sibling::p", false); 556 + assert!(matches!(result, Err(Error::XPathError(_)))); 557 + } 525 558 } 526 559 527 560 #[test] 528 561 fn test_wikipedia_xpath_patterns() { 529 562 let extractor = XPathExtractor::new(String::new()); 530 - 531 - // Wikipedia title XPath 532 563 let (css, filter) = extractor.xpath_to_css_with_attr("//h1[@id='firstHeading']").unwrap(); 533 564 assert_eq!(css, "h1#firstHeading"); 534 565 assert!(filter.is_none()); 535 566 536 - // Wikipedia body XPath (note space around =) 537 567 let (css, filter) = extractor.xpath_to_css_with_attr("//div[@id = 'bodyContent']").unwrap(); 538 568 assert_eq!(css, "div#bodyContent"); 539 569 assert!(filter.is_none());

+83 -49

crates/readability/tests/readability_tests.rs

··· 1 1 use malfestio_readability::Readability; 2 + use std::fs; 3 + use std::path::PathBuf; 2 4 3 - #[tokio::test] 4 - #[ignore = "requires network access"] 5 - async fn test_arxiv_extraction() { 6 - let url = "https://arxiv.org/abs/2009.03017"; 5 + fn get_test_html(filename: &str) -> Option<String> { 6 + let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); 7 + path.push("tests/data"); 8 + path.push(filename); 7 9 8 - let client = reqwest::Client::builder() 9 - .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)") 10 - .build() 11 - .unwrap(); 10 + if path.exists() { 11 + Some(fs::read_to_string(path).unwrap()) 12 + } else { 13 + println!("Test data file not found: {:?}. Skipping test.", path); 14 + None 15 + } 16 + } 12 17 13 - let response = client.get(url).send().await.unwrap(); 14 - let html = response.text().await.unwrap(); 18 + #[test] 19 + fn test_arxiv_extraction() { 20 + let html = match get_test_html("arxiv.html") { 21 + Some(h) => h, 22 + None => return, 23 + }; 24 + let url = "https://arxiv.org/abs/2009.03017"; 15 25 16 26 let readability = Readability::new(html, Some(url)); 17 27 let article = readability.parse().unwrap(); 18 28 19 29 assert!(!article.title.is_empty(), "Title should be extracted"); 20 - println!("Title: {}", article.title); 30 + assert!(article.title.contains("Non-exponentially weighted aggregation")); 21 31 22 32 assert!(!article.markdown.is_empty(), "Body/markdown should be extracted"); 23 33 assert!(article.markdown.len() > 50, "Abstract should have substantial content"); 24 - println!("Markdown length: {} chars", article.markdown.len()); 25 34 26 - assert!(article.author.is_some(), "Author should be extracted from meta tag"); 27 - println!("Author: {:?}", article.author); 28 - 29 - assert!( 30 - article.published_date.is_some(), 31 - "Date should be extracted from meta tag" 32 - ); 33 - println!("Date: {:?}", article.published_date); 35 + // Arxiv meta tag uses "Lastname, Firstname" format: <meta name="citation_author" 
content="Alquier, Pierre" /> 36 + assert_eq!(article.author.as_deref(), Some("Alquier, Pierre")); 37 + assert_eq!(article.published_date.as_deref(), Some("2020/09/07")); 34 38 } 35 39 36 - #[tokio::test] 37 - #[ignore = "requires network access"] 38 - async fn test_wikipedia_extraction() { 40 + #[test] 41 + fn test_wikipedia_extraction() { 42 + let html = match get_test_html("wikipedia.html") { 43 + Some(h) => h, 44 + None => return, 45 + }; 39 46 let url = "https://en.wikipedia.org/wiki/Rust_(programming_language)"; 40 47 41 - let client = reqwest::Client::builder() 42 - .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)") 43 - .build() 44 - .unwrap(); 45 - 46 - let response = client.get(url).send().await.unwrap(); 47 - let html = response.text().await.unwrap(); 48 - 49 48 let readability = Readability::new(html, Some(url)); 50 49 let article = readability.parse().unwrap(); 51 50 52 51 assert!(article.title.contains("Rust"), "Title should contain 'Rust'"); 53 - println!("Title: {}", article.title); 54 - 55 52 assert!( 56 53 article.markdown.len() > 1000, 57 54 "Wikipedia article should have substantial content" 58 55 ); 59 - println!("Markdown length: {} chars", article.markdown.len()); 60 56 61 - // Verify strip rules worked: mw-editsection elements should be removed 62 57 assert!( 63 58 !article.content.contains("mw-editsection"), 64 59 "Edit section elements (mw-editsection) should be stripped" 65 60 ); 66 61 } 67 62 68 - /// Test extraction for site without specific rules (falls back to generic) 69 - #[tokio::test] 70 - #[ignore = "requires network access"] 71 - async fn test_generic_fallback_extraction() { 63 + #[test] 64 + fn test_generic_fallback_extraction() { 65 + let html = match get_test_html("generic.html") { 66 + Some(h) => h, 67 + None => return, 68 + }; 72 69 let url = "https://www.rust-lang.org/"; 73 - 74 - let client = reqwest::Client::builder() 75 - .user_agent("Mozilla/5.0 (compatible; MalfestioBot/1.0)") 76 - .build() 77 - .unwrap(); 78 - 
79 - let response = client.get(url).send().await.unwrap(); 80 - let html = response.text().await.unwrap(); 81 70 82 71 let readability = Readability::new(html, Some(url)); 83 72 let article = readability.parse().unwrap(); 84 73 85 74 assert!(!article.title.is_empty(), "Title should be extracted via generic"); 86 75 assert!(!article.markdown.is_empty(), "Content should be extracted via generic"); 76 + } 87 77 88 - println!("Title: {}", article.title); 89 - println!("Markdown length: {} chars", article.markdown.len()); 78 + #[test] 79 + fn test_substack_extraction() { 80 + let html = match get_test_html("substack.html") { 81 + Some(h) => h, 82 + None => return, 83 + }; 84 + let url = "https://taibbi.substack.com/p/glenn-greenwald-on-his-resignation"; 85 + 86 + let readability = Readability::new(html, Some(url)); 87 + let article = readability.parse().unwrap(); 88 + 89 + assert!(!article.title.is_empty(), "Title should be extracted"); 90 + assert!( 91 + article.title.contains("Glenn Greenwald"), 92 + "Title should match expectation" 93 + ); 94 + } 95 + 96 + #[test] 97 + fn test_theonion_extraction() { 98 + let html = match get_test_html("theonion.html") { 99 + Some(h) => h, 100 + None => return, 101 + }; 102 + let url = "https://www.theonion.com/theresa-may-narrowly-manages-to-survive-parliamentary-f-1831077604"; 103 + 104 + let readability = Readability::new(html, Some(url)); 105 + let article = readability.parse().unwrap(); 106 + 107 + assert!(!article.title.is_empty(), "Title should be extracted"); 108 + // The onion uses JSON-LD or meta tags usually, check if our rules caught it 109 + // TODO: we should implement JSON-LD support 110 + } 111 + 112 + #[test] 113 + fn test_readthedocs_extraction() { 114 + let html = match get_test_html("readthedocs.html") { 115 + Some(h) => h, 116 + None => return, 117 + }; 118 + let url = "http://docs.readthedocs.io/en/latest/getting_started.html"; 119 + 120 + let readability = Readability::new(html, Some(url)); 121 + let article = 
readability.parse().unwrap(); 122 + 123 + assert!(!article.title.is_empty(), "Title should be extracted"); 90 124 }

+8

justfile

··· 12 12 start: 13 13 cargo run --bin malfestio-cli start 14 14 15 + # Fetch test data for readability tests 16 + fetch-test-data: 17 + ./scripts/fetch_test_data.sh 18 + 19 + # Run readability tests (fetches data first) 20 + test-readability: fetch-test-data 21 + cargo test -p malfestio-readability --test readability_tests 22 + 15 23 # Run all tests 16 24 test: 17 25 cargo test --quiet

+24

scripts/fetch_test_data.sh

#!/bin/bash
# Download the HTML fixtures used by the readability integration tests.
#
# Each page is fetched once and cached under crates/readability/tests/data
# (relative to the current directory); delete a file to force a re-download.
# A failed download is reported on stderr but does not abort the run: the
# remaining fixtures are still fetched.

DATA_DIR="crates/readability/tests/data"
USER_AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

mkdir -p "$DATA_DIR"

failures=0

# fetch_if_missing URL OUTPUT
# Downloads URL to OUTPUT unless OUTPUT already holds a non-empty file.
fetch_if_missing() {
    local url="$1"
    local output="$2"
    local partial="$output.part"

    # -s, not -f: a zero-byte file left behind by an aborted run is not a usable fixture.
    if [ -s "$output" ]; then
        echo "Cached: $output"
        return 0
    fi

    echo "Fetching $url..."
    # --fail makes curl exit non-zero on HTTP errors instead of saving the error
    # page; the temporary file keeps a failed or partial download from ever
    # being reported as "Cached" on the next run.
    if curl --fail -L -H "User-Agent: $USER_AGENT" -o "$partial" "$url"; then
        mv -- "$partial" "$output"
    else
        rm -f -- "$partial"
        echo "Warning: could not fetch $url; skipping $output" >&2
        failures=$((failures + 1))
        return 1
    fi
}

fetch_if_missing "https://arxiv.org/abs/2009.03017" "$DATA_DIR/arxiv.html"
fetch_if_missing "https://en.wikipedia.org/wiki/Rust_(programming_language)" "$DATA_DIR/wikipedia.html"
fetch_if_missing "https://dougshapiro.medium.com/how-will-the-disruption-of-hollywood-play-out-42f724c921e1" "$DATA_DIR/medium.html"
fetch_if_missing "https://taibbi.substack.com/p/glenn-greenwald-on-his-resignation" "$DATA_DIR/substack.html"
fetch_if_missing "https://www.theonion.com/theresa-may-narrowly-manages-to-survive-parliamentary-f-1831077604" "$DATA_DIR/theonion.html"
fetch_if_missing "http://docs.readthedocs.io/en/latest/getting_started.html" "$DATA_DIR/readthedocs.html"
fetch_if_missing "https://www.rust-lang.org/" "$DATA_DIR/generic.html"

if [ "$failures" -gt 0 ]; then
    echo "Warning: $failures fixture(s) could not be downloaded." >&2
fi

echo "Done."