//! project to map out webrings
1use thiserror::Error;
2
3use crate::utils;
4
// Minimal robots.txt parser: `user-agent`, `allow` and `disallow` only;
// `sitemap` directives are recognized but intentionally ignored.
6
/// Parsed representation of a robots.txt file: rule groups keyed by user agent.
#[derive(Default, Debug, PartialEq)]
pub struct RobotsTxt {
    // keys are lowercased user-agent names; the wildcard "*" is stored as-is
    agents: std::collections::HashMap<String, Rules>,
}
11
/// Allow/disallow path prefixes for a single user agent.
// `Eq` added: all fields are `Eq`, so deriving only `PartialEq` trips
// clippy's `derive_partial_eq_without_eq` and blocks use as a full-equality type.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub struct Rules {
    /// path prefixes the agent is explicitly allowed to fetch
    allow: Vec<String>,
    /// path prefixes the agent must not fetch
    disallow: Vec<String>,
}
17
/// Errors that can occur while parsing a robots.txt document.
#[derive(Error, Debug)]
pub enum RobotsTxtError {
    /// a line is missing a colon; every non-comment line in robots.txt must contain a colon
    #[error("missing colon in line: {0}")]
    MissingColon(String),

    /// a `user-agent` directive was followed by an empty value, which is invalid
    #[error("no user agent specified on line: {0}")]
    EmptyUserAgent(String),

    /// invalid directive; the only valid ones are `user-agent`, `allow`, `disallow` and `sitemap`
    #[error("unknown directive \"{directive}\" on line: {line}")]
    UnknownDirective { directive: String, line: String },

    /// an `allow`/`disallow` rule appeared before any `user-agent` directive, which is invalid
    #[error("no user agent was specified before rule: {0}")]
    NoUserAgent(String),
}
36
37impl RobotsTxt {
38 pub fn parse(input: &str) -> Result<Self, RobotsTxtError> {
39 let mut robots = Self::default();
40 let mut current_agent = (String::default(), Rules::default());
41
42 for line in input.lines() {
43 let line = line.trim();
44
45 if line.is_empty() || line.starts_with('#') {
46 continue;
47 }
48
49 let (key, value) = line
50 .split_once(':')
51 .map(|(k, v)| (k.trim().to_lowercase(), v.trim().to_string()))
52 .ok_or_else(|| RobotsTxtError::MissingColon(line.to_string()))?;
53
54 let handle_rule = |list: &mut Vec<String>| -> Result<(), RobotsTxtError> {
55 if current_agent.0.is_empty() {
56 return Err(RobotsTxtError::NoUserAgent(line.to_string()));
57 }
58
59 if value.is_empty() {
60 return Ok(());
61 }
62
63 list.push(value.to_string());
64 Ok(())
65 };
66
67 match key.as_str() {
68 "user-agent" => {
69 println!("found user agent {value}");
70 if value.is_empty() {
71 return Err(RobotsTxtError::EmptyUserAgent(line.to_string()));
72 }
73
74 if !current_agent.0.is_empty() {
75 robots
76 .agents
77 .insert(current_agent.0, current_agent.1.clone());
78 }
79
80 current_agent.0 = value.to_lowercase();
81 current_agent.1.clear();
82 }
83 "allow" => handle_rule(&mut current_agent.1.allow)?,
84 "disallow" => handle_rule(&mut current_agent.1.disallow)?,
85 "sitemap" => (),
86 _ => {
87 return Err(RobotsTxtError::UnknownDirective {
88 directive: key,
89 line: line.to_string(),
90 });
91 }
92 };
93 }
94
95 if !current_agent.0.is_empty() {
96 robots
97 .agents
98 .insert(current_agent.0, current_agent.1.clone());
99 }
100
101 Ok(robots)
102 }
103
104 /// retrive the rules for a input user agent
105 /// case insensitive, will remove everything after the `/`, and everything in `()` from the input
106 pub fn get_rules(&self, useragent: &str) -> Option<&Rules> {
107 let useragent = utils::clean_useragent(useragent);
108
109 self.agents.get(&useragent)
110 }
111
112 /// takes a useragent and a path, and tells you if it is allowed to access that path
113 pub fn is_allowed(&self, useragent: &str, path: &str) -> bool {
114 let useragent = utils::clean_useragent(useragent);
115
116 // first check agent-specific rules
117 if let Some(rules) = self.agents.get(&useragent) {
118 return rules.is_allowed(path);
119 }
120
121 // then check the wildcard rules
122 if let Some(rules) = self.agents.get("*") {
123 return rules.is_allowed(path);
124 }
125
126 true
127 }
128}
129
130impl Rules {
131 fn clear(&mut self) {
132 self.allow.clear();
133 self.disallow.clear();
134 }
135
136 pub fn allow(&self) -> &[String] {
137 &self.allow
138 }
139
140 pub fn disallow(&self) -> &[String] {
141 &self.disallow
142 }
143
144 pub fn is_allowed(&self, path: &str) -> bool {
145 let longest_match = |patterns: &[String]| {
146 patterns
147 .iter()
148 .filter_map(|p| {
149 if path.starts_with(p) {
150 Some(p.len())
151 } else {
152 None
153 }
154 })
155 .max()
156 .unwrap_or(0)
157 };
158
159 let allow_len = longest_match(&self.allow);
160 let disallow_len = longest_match(&self.disallow);
161
162 if disallow_len > allow_len {
163 false
164 } else {
165 true
166 }
167 }
168}
169
#[cfg(test)]
mod tests {
    // import via `super` consistently — the original mixed
    // `crate::robotstxt::RobotsTxt` with `super::` paths
    use super::{RobotsTxt, Rules};

    #[test]
    fn test_allow_rule_parsing() {
        let input = "
        user-agent: fooBot
        allow: *
        ";

        let robots = RobotsTxt::parse(input).unwrap();

        assert_eq!(
            robots
                .get_rules("fooBot/1.0")
                .unwrap()
                .allow()
                .first()
                .unwrap(),
            &"*".to_string()
        )
    }

    #[test]
    fn test_disallow_rule_parsing() {
        let input = "
        user-agent: fooBot
        disallow: *
        ";

        let robots = RobotsTxt::parse(input).unwrap();

        assert_eq!(
            robots
                .get_rules("fooBot/1.0")
                .unwrap()
                .disallow()
                .first()
                .unwrap(),
            &"*".to_string()
        )
    }

    #[test]
    fn test_combined_rule_parsing() {
        let input = "
        user-agent: fooBot
        disallow: *
        allow: /foo
        ";

        let robots = RobotsTxt::parse(input).unwrap();
        let rules = robots.get_rules("fooBot/1.0").unwrap();

        assert_eq!(rules.disallow().first().unwrap(), &"*".to_string());
        assert_eq!(rules.allow().first().unwrap(), &"/foo".to_string())
    }

    #[test]
    fn missing_colon() {
        let input = "
        user-agent: FooBot
        allow *
        disallow: /private
        ";

        assert!(RobotsTxt::parse(input).is_err())
    }

    #[test]
    fn empty_useragent() {
        let input = "
        user-agent:
        allow: *
        disallow: /private
        ";

        assert!(RobotsTxt::parse(input).is_err())
    }

    #[test]
    fn unknown_directive() {
        let input = "
        user-agent: EvilBot
        PLEASE-dont-go-here-evilbot: /secret-plans/
        ";

        assert!(RobotsTxt::parse(input).is_err())
    }

    #[test]
    fn no_useragent() {
        let input = "
        allow: *
        ";

        assert!(RobotsTxt::parse(input).is_err())
    }

    #[test]
    fn multiple_useragents() {
        let input = "
        User-agent: Googlebot
        Disallow: /

        User-agent: BotFoo
        Disallow: /private

        User-agent: FooBot
        Disallow: /fooland
        ";

        let robots = RobotsTxt::parse(input).unwrap();

        assert!(robots.get_rules("googlebot").is_some());
        assert!(robots.get_rules("BotFoo").is_some());
        assert!(robots.get_rules("foobot").is_some());
    }

    #[test]
    fn empty_allow_and_disallow_rules() {
        let input = "
        User-agent: FooBot
        Allow: /
        Disallow:

        User-agent: BotFoo
        Allow:
        Disallow: /
        ";

        let robots = RobotsTxt::parse(input).unwrap();

        let foobot = robots.get_rules("foobot").unwrap();
        let botfoo = robots.get_rules("botfoo").unwrap();

        assert_eq!(foobot.allow().len(), 1);
        assert_eq!(foobot.disallow().len(), 0);
        assert_eq!(botfoo.allow().len(), 0);
        assert_eq!(botfoo.disallow().len(), 1);
    }

    #[test]
    fn rules_is_allowed() {
        let rules = Rules {
            allow: vec!["/public".into()],
            disallow: vec!["/".into()],
        };

        assert!(!rules.is_allowed("/private/page"));
        assert!(rules.is_allowed("/public/info"));
    }

    #[test]
    fn agents_is_allowed_explicit_allow() {
        let input = "
        user-agent: *
        disallow: Private

        user-agent: FooBot
        Allow: /private
        ";

        let robots = RobotsTxt::parse(input).unwrap();

        assert!(robots.is_allowed("foobot", "/private"))
    }

    #[test]
    fn agents_is_allowed_explicit_disallow() {
        let input = "
        user-agent: *
        Allow: /private

        user-agent: FooBot
        Disallow: /private
        ";

        let robots = RobotsTxt::parse(input).unwrap();

        assert!(!robots.is_allowed("foobot", "/private"))
    }

    #[test]
    fn agents_is_allowed_fallback_to_wildcard() {
        let input = "
        user-agent: *
        Allow: /private
        ";

        let robots = RobotsTxt::parse(input).unwrap();

        assert!(robots.is_allowed("foobot", "/private"))
    }

    #[test]
    fn agents_is_allowed_empty_robots() {
        let robots = RobotsTxt::default();

        assert!(robots.is_allowed("foobot", "/private"))
    }
}
393}