project to map out webrings
at main 393 lines 10 kB view raw
1use thiserror::Error; 2 3use crate::utils; 4 5// simple parsing for robots.txt, no sitemap because lazy 6 7#[derive(Default, Debug, PartialEq)] 8pub struct RobotsTxt { 9 agents: std::collections::HashMap<String, Rules>, 10} 11 12#[derive(Default, Debug, Clone, PartialEq)] 13pub struct Rules { 14 allow: Vec<String>, 15 disallow: Vec<String>, 16} 17 18#[derive(Error, Debug)] 19pub enum RobotsTxtError { 20 /// a line is missing a colon, all non comment lines in robots.txt should contain a colon 21 #[error("missing colon in line: {0}")] 22 MissingColon(String), 23 24 /// a `user-agent` directive was followed by an empty value, this is invalid 25 #[error("no user agent specified on line: {0}")] 26 EmptyUserAgent(String), 27 28 /// invlaid directive, the only valid ones are `user-agent`, `allow`, `disallow` and `sitemap` 29 #[error("unknown directive \"{directive}\" on line: {line}")] 30 UnknownDirective { directive: String, line: String }, 31 32 /// a rule was found without a `user-agent` directive before it somewhere INVALID! 33 #[error("no user agent was specified before rule: {0}")] 34 NoUserAgent(String), 35} 36 37impl RobotsTxt { 38 pub fn parse(input: &str) -> Result<Self, RobotsTxtError> { 39 let mut robots = Self::default(); 40 let mut current_agent = (String::default(), Rules::default()); 41 42 for line in input.lines() { 43 let line = line.trim(); 44 45 if line.is_empty() || line.starts_with('#') { 46 continue; 47 } 48 49 let (key, value) = line 50 .split_once(':') 51 .map(|(k, v)| (k.trim().to_lowercase(), v.trim().to_string())) 52 .ok_or_else(|| RobotsTxtError::MissingColon(line.to_string()))?; 53 54 let handle_rule = |list: &mut Vec<String>| -> Result<(), RobotsTxtError> { 55 if current_agent.0.is_empty() { 56 return Err(RobotsTxtError::NoUserAgent(line.to_string())); 57 } 58 59 if value.is_empty() { 60 return Ok(()); 61 } 62 63 list.push(value.to_string()); 64 Ok(()) 65 }; 66 67 match key.as_str() { 68 "user-agent" => { 69 println!("found user agent {value}"); 70 if value.is_empty() { 71 return Err(RobotsTxtError::EmptyUserAgent(line.to_string())); 72 } 73 74 if !current_agent.0.is_empty() { 75 robots 76 .agents 77 .insert(current_agent.0, current_agent.1.clone()); 78 } 79 80 current_agent.0 = value.to_lowercase(); 81 current_agent.1.clear(); 82 } 83 "allow" => handle_rule(&mut current_agent.1.allow)?, 84 "disallow" => handle_rule(&mut current_agent.1.disallow)?, 85 "sitemap" => (), 86 _ => { 87 return Err(RobotsTxtError::UnknownDirective { 88 directive: key, 89 line: line.to_string(), 90 }); 91 } 92 }; 93 } 94 95 if !current_agent.0.is_empty() { 96 robots 97 .agents 98 .insert(current_agent.0, current_agent.1.clone()); 99 } 100 101 Ok(robots) 102 } 103 104 /// retrive the rules for a input user agent 105 /// case insensitive, will remove everything after the `/`, and everything in `()` from the input 106 pub fn get_rules(&self, useragent: &str) -> Option<&Rules> { 107 let useragent = utils::clean_useragent(useragent); 108 109 self.agents.get(&useragent) 110 } 111 112 /// takes a useragent and a path, and tells you if it is allowed to access that path 113 pub fn is_allowed(&self, useragent: &str, path: &str) -> bool { 114 let useragent = utils::clean_useragent(useragent); 115 116 // first check agent-specific rules 117 if let Some(rules) = self.agents.get(&useragent) { 118 return rules.is_allowed(path); 119 } 120 121 // then check the wildcard rules 122 if let Some(rules) = self.agents.get("*") { 123 return rules.is_allowed(path); 124 } 125 126 true 127 } 128} 129 130impl Rules { 131 fn clear(&mut self) { 132 self.allow.clear(); 133 self.disallow.clear(); 134 } 135 136 pub fn allow(&self) -> &[String] { 137 &self.allow 138 } 139 140 pub fn disallow(&self) -> &[String] { 141 &self.disallow 142 } 143 144 pub fn is_allowed(&self, path: &str) -> bool { 145 let longest_match = |patterns: &[String]| { 146 patterns 147 .iter() 148 .filter_map(|p| { 149 if path.starts_with(p) { 150 Some(p.len()) 151 } else { 152 None 153 } 154 }) 155 .max() 156 .unwrap_or(0) 157 }; 158 159 let allow_len = longest_match(&self.allow); 160 let disallow_len = longest_match(&self.disallow); 161 162 if disallow_len > allow_len { 163 false 164 } else { 165 true 166 } 167 } 168} 169 170#[cfg(test)] 171mod tests { 172 use crate::robotstxt::RobotsTxt; 173 174 #[test] 175 fn test_allow_rule_parsing() { 176 let input = " 177 user-agent: fooBot 178 allow: * 179 "; 180 181 let robots = RobotsTxt::parse(input).unwrap(); 182 183 dbg!(&robots); 184 185 assert_eq!( 186 robots 187 .get_rules("fooBot/1.0") 188 .unwrap() 189 .allow() 190 .first() 191 .unwrap(), 192 &"*".to_string() 193 ) 194 } 195 196 #[test] 197 fn test_disallow_rule_parsing() { 198 let input = " 199 user-agent: fooBot 200 disallow: * 201 "; 202 203 let robots = RobotsTxt::parse(input).unwrap(); 204 205 dbg!(&robots); 206 207 assert_eq!( 208 robots 209 .get_rules("fooBot/1.0") 210 .unwrap() 211 .disallow() 212 .first() 213 .unwrap(), 214 &"*".to_string() 215 ) 216 } 217 218 #[test] 219 fn test_combined_rule_parsing() { 220 let input = " 221 user-agent: fooBot 222 disallow: * 223 allow: /foo 224 "; 225 226 let robots = RobotsTxt::parse(input).unwrap(); 227 228 dbg!(&robots); 229 230 assert_eq!( 231 robots 232 .get_rules("fooBot/1.0") 233 .unwrap() 234 .disallow() 235 .first() 236 .unwrap(), 237 &"*".to_string() 238 ); 239 240 assert_eq!( 241 robots 242 .get_rules("fooBot/1.0") 243 .unwrap() 244 .allow() 245 .first() 246 .unwrap(), 247 &"/foo".to_string() 248 ) 249 } 250 251 #[test] 252 fn missing_colon() { 253 let input = " 254 user-agent: FooBot 255 allow * 256 disallow: /private 257 "; 258 259 assert!(super::RobotsTxt::parse(input).is_err()) 260 } 261 262 #[test] 263 fn empty_useragent() { 264 let input = " 265 user-agent: 266 allow: * 267 disallow: /private 268 "; 269 270 assert!(super::RobotsTxt::parse(input).is_err()) 271 } 272 273 #[test] 274 fn unknown_directive() { 275 let input = " 276 user-agent: EvilBot 277 PLEASE-dont-go-here-evilbot: /secret-plans/ 278 "; 279 280 assert!(super::RobotsTxt::parse(input).is_err()) 281 } 282 283 #[test] 284 fn no_useragent() { 285 let input = " 286 allow: * 287 "; 288 289 assert!(super::RobotsTxt::parse(input).is_err()) 290 } 291 292 #[test] 293 fn multiple_useragents() { 294 let input = " 295 User-agent: Googlebot 296 Disallow: / 297 298 User-agent: BotFoo 299 Disallow: /private 300 301 User-agent: FooBot 302 Disallow: /fooland 303 "; 304 305 let robots = RobotsTxt::parse(input).unwrap(); 306 307 assert!(robots.get_rules("googlebot").is_some()); 308 assert!(robots.get_rules("BotFoo").is_some()); 309 assert!(robots.get_rules("foobot").is_some()); 310 } 311 312 #[test] 313 fn empty_allow_and_disallow_rules() { 314 let input = " 315 User-agent: FooBot 316 Allow: / 317 Disallow: 318 319 User-agent: BotFoo 320 Allow: 321 Disallow: / 322 "; 323 324 let robots = RobotsTxt::parse(input).unwrap(); 325 326 assert!(robots.get_rules("botfoo").is_some()); 327 assert!(robots.get_rules("foobot").is_some()); 328 assert_eq!(robots.get_rules("foobot").unwrap().allow().len(), 1); 329 assert_eq!(robots.get_rules("foobot").unwrap().disallow().len(), 0); 330 assert_eq!(robots.get_rules("botfoo").unwrap().allow().len(), 0); 331 assert_eq!(robots.get_rules("botfoo").unwrap().disallow().len(), 1); 332 } 333 334 #[test] 335 fn rules_is_allowed() { 336 let rules = super::Rules { 337 allow: vec!["/public".into()], 338 disallow: vec!["/".into()], 339 }; 340 341 assert!(!rules.is_allowed("/private/page")); 342 assert!(rules.is_allowed("/public/info")); 343 } 344 345 #[test] 346 fn agents_is_allowed_explicit_allow() { 347 let input = " 348 user-agent: * 349 disallow: Private 350 351 user-agent: FooBot 352 Allow: /private 353 "; 354 355 let robots = RobotsTxt::parse(input).unwrap(); 356 357 assert!(robots.is_allowed("foobot", "/private")) 358 } 359 360 #[test] 361 fn agents_is_allowed_explicit_disallow() { 362 let input = " 363 user-agent: * 364 Allow: /private 365 366 user-agent: FooBot 367 Disallow: /private 368 "; 369 370 let robots = RobotsTxt::parse(input).unwrap(); 371 372 assert!(!robots.is_allowed("foobot", "/private")) 373 } 374 375 #[test] 376 fn agents_is_allowed_fallback_to_wildcard() { 377 let input = " 378 user-agent: * 379 Allow: /private 380 "; 381 382 let robots = RobotsTxt::parse(input).unwrap(); 383 384 assert!(robots.is_allowed("foobot", "/private")) 385 } 386 387 #[test] 388 fn agents_is_allowed_empty_robots() { 389 let robots = RobotsTxt::default(); 390 391 assert!(robots.is_allowed("foobot", "/private")) 392 } 393}