CMU Coding Bootcamp
at private/coded/push-mttnlwyrtnss 211 lines 6.8 kB view raw
import { SearchIndex } from ".";
import { toRegex } from "glob-to-regex.js";
import { EventEmitter } from "node:events";

/** The allow/disallow path rules that apply to a single user agent. */
interface RobotUrls {
  allows: Set<string>;
  disallows: Set<string>;
}

/**
 * Parses the text of a robots.txt file into per-user-agent rule tables.
 *
 * `allow` and `disallow` map a lower-cased user-agent name (as written in the
 * file) to the set of path patterns declared for it. Path patterns keep their
 * original case because URL paths are case-sensitive.
 */
export class RobotsParser {
  disallow: Map<string, Set<string>> = new Map();
  allow: Map<string, Set<string>> = new Map();

  /**
   * @param text Raw contents of a robots.txt file.
   *
   * Groups are delimited the way RFC 9309 describes: one or more
   * `User-agent` lines start a group, the `Allow`/`Disallow` lines that
   * follow belong to it, and the next `User-agent` line seen after a rule
   * starts a new group. Blank lines, comments, and unknown directives are
   * skipped and do not end a group.
   */
  constructor(text: string) {
    let agents: string[] = [];
    let sawRule = false;

    for (const rawLine of text.split(/\r\n|\r|\n/)) {
      // Drop comments (full-line or trailing) and surrounding whitespace.
      const line = rawLine.replace(/#.*$/, "").trim();

      // Split on the FIRST colon only, so values containing ':' survive.
      const separator = line.indexOf(":");
      if (separator === -1) continue;
      const field = line.slice(0, separator).trim().toLowerCase();
      const value = line.slice(separator + 1).trim();

      if (field === "user-agent") {
        if (sawRule) {
          // A user-agent line after rules begins a fresh group.
          agents = [];
          sawRule = false;
        }
        if (value === "") continue;
        const agent = value.toLowerCase();
        agents.push(agent);
        // Keep both tables in step so every declared agent has an entry.
        if (!this.allow.has(agent)) this.allow.set(agent, new Set());
        if (!this.disallow.has(agent)) this.disallow.set(agent, new Set());
      } else if (field === "allow" || field === "disallow") {
        sawRule = true;
        // An empty value (e.g. "Disallow:") restricts nothing.
        if (value === "") continue;
        const table = field === "allow" ? this.allow : this.disallow;
        for (const agent of agents) {
          table.get(agent)?.add(value);
        }
      }
    }
  }

  /**
   * Decides whether `url` may be fetched under the given rules.
   *
   * The most specific (longest) matching pattern wins; when an allow and a
   * disallow pattern of equal length both match, the allow wins. A URL that
   * matches no rule is allowed.
   *
   * @param urls Rules for one user agent, e.g. from {@link getUrlsForUA}.
   * @param url Absolute URL, or a bare path (optionally with a query string).
   * @returns `true` when fetching is permitted.
   */
  static checkUserAgent(urls: RobotUrls, url: string): boolean {
    const path = RobotsParser.pathOf(url);
    const allowLength = RobotsParser.longestMatch(urls.allows, path);
    const disallowLength = RobotsParser.longestMatch(urls.disallows, path);
    return allowLength >= disallowLength;
  }

  /**
   * Collects the rules that apply to the user agent `ua`.
   *
   * Group names other than `*` are matched against the lower-cased `ua`
   * through `toRegex`. Rules of all matching named groups are merged; the
   * `*` group is used only when no named group matches.
   *
   * @param ua User-agent string of the crawler.
   * @returns Merged allow and disallow pattern sets (possibly empty).
   */
  getUrlsForUA(ua: string): RobotUrls {
    const agent = ua.toLowerCase();
    const groupNames = new Set<string>([
      ...this.allow.keys(),
      ...this.disallow.keys(),
    ]);

    const named: string[] = [];
    let hasWildcard = false;
    for (const name of groupNames) {
      if (name === "*") {
        hasWildcard = true;
      } else if (toRegex(name).test(agent)) {
        named.push(name);
      }
    }
    const selected = named.length > 0 ? named : hasWildcard ? ["*"] : [];

    const allows = new Set<string>();
    const disallows = new Set<string>();
    for (const name of selected) {
      for (const rule of this.allow.get(name) ?? []) allows.add(rule);
      for (const rule of this.disallow.get(name) ?? []) disallows.add(rule);
    }
    return { allows, disallows };
  }

  /** Extracts path + query from an absolute URL; other input is returned as-is. */
  private static pathOf(url: string): string {
    try {
      const parsed = new URL(url);
      return parsed.pathname + parsed.search;
    } catch {
      // Not an absolute URL: treat the input as already being a path.
      return url;
    }
  }

  /** Length of the longest pattern in `rules` matching `path`, or -1 if none match. */
  private static longestMatch(rules: Iterable<string>, path: string): number {
    let best = -1;
    for (const rule of rules) {
      if (rule === "" || rule.length <= best) continue;
      if (RobotsParser.compileRule(rule).test(path)) best = rule.length;
    }
    return best;
  }

  /**
   * Compiles a robots.txt path pattern to a RegExp: the pattern matches from
   * the start of the path, `*` matches any run of characters, and a trailing
   * `$` anchors the end. Every other character is matched literally.
   */
  private static compileRule(rule: string): RegExp {
    const anchored = rule.endsWith("$");
    const body = anchored ? rule.slice(0, -1) : rule;
    const source = body
      .replace(/\*+/g, "*") // collapse runs of '*' to limit regex backtracking
      .split("*")
      .map((part) => part.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
      .join(".*");
    return new RegExp(`^${source}${anchored ? "$" : ""}`);
  }
}

// Absolute http(s) URLs in raw text; stops at whitespace, quotes and angle
// brackets so attribute delimiters and tags are not swallowed into the link.
const urlRegex = /https?:\/\/[^\s"'<>]+/g;

/**
 * Event-driven crawler that honours robots.txt.
 *
 * Events:
 * - `addURL` (URL): request that a page be fetched and processed.
 * - `storePage` (URL): emitted after a page has been added to `index`.
 * - `stop`: removes every listener on this emitter, so no further `addURL`
 *   events are handled. Work already in flight is not cancelled.
 */
export class Crawler extends EventEmitter {
  // Origin (scheme + host + port) -> rules that apply to this crawler's UA.
  private robots: Map<string, RobotUrls> = new Map();
  // Fragment-less hrefs already claimed for fetching. Strings are stored
  // because a Set of URL objects compares by identity and would never match.
  private visited: Set<string> = new Set();

  /**
   * @param UA User-agent sent with requests and matched against robots.txt.
   * @param index Search index that receives the crawled pages.
   */
  constructor(
    private readonly UA: string,
    public index: SearchIndex,
  ) {
    super();
    this.on("addURL", (url: URL) => {
      console.log(`Adding URL: ${url.href}`);
      // Fire-and-forget, but never leave a rejection unhandled.
      this.processPage(url).catch((error: unknown) => {
        console.error(`Failed to process ${url.href}:`, error);
      });
    });
    this.once("stop", () => {
      this.removeAllListeners();
    });
  }

  /** Key used in `visited`: the URL's href with any #fragment removed. */
  private static visitKey(url: URL): string {
    const normalized = new URL(url.href);
    normalized.hash = "";
    return normalized.href;
  }

  /** Resolves to `true` when robots.txt forbids this crawler from fetching `url`. */
  private async checkDisallowed(url: URL): Promise<boolean> {
    const robots =
      this.robots.get(url.origin) ?? (await this.getRobotsTxt(url));
    return !RobotsParser.checkUserAgent(robots, url.toString());
  }

  /**
   * Fetches and parses robots.txt for the origin of `url`, caching the result
   * per origin. A non-200 response or a non-`text/plain` body is treated as
   * "no rules" and cached as such. Network errors from `fetch` propagate to
   * the caller and are not cached.
   */
  private async getRobotsTxt(url: URL): Promise<RobotUrls> {
    // Resolve against the origin so a non-default port is preserved.
    const robotsTxtUrl = new URL("/robots.txt", url.origin);

    const response = await fetch(robotsTxtUrl, {
      headers: {
        "User-Agent": this.UA,
      },
    });

    let rules: RobotUrls = { allows: new Set(), disallows: new Set() };
    const isPlainText =
      response.headers.get("content-type")?.startsWith("text/plain") ?? false;
    if (response.status === 200 && isPlainText) {
      rules = new RobotsParser(await response.text()).getUrlsForUA(this.UA);
    }
    this.robots.set(url.origin, rules);
    return rules;
  }

  /**
   * Scans `html` for absolute http(s) URLs and emits `addURL` for each one
   * that robots.txt permits. Links that cannot be parsed, or whose robots.txt
   * cannot be fetched, are skipped so one bad link does not abort the page.
   */
  private async addOutlinks(html: string): Promise<void> {
    for (const [link] of html.matchAll(urlRegex)) {
      console.log(link);
      try {
        const url = new URL(link);
        if (!(await this.checkDisallowed(url))) {
          this.emit("addURL", url);
        }
      } catch (error: unknown) {
        console.error(`Skipping link ${link}:`, error);
      }
    }
  }

  // private getText(html: string): string {
  //   const parser = new DOMParser();
  //   const doc = parser.parseFromString(html, "text/html");
  //   return doc.body.textContent || "";
  // }

  /**
   * Fetches `url` at most once.
   *
   * @returns The response body when the page is allowed by robots.txt, the
   * response is OK and its content type is `text/html`; otherwise `undefined`.
   */
  private async getPage(url: URL): Promise<string | undefined> {
    const key = Crawler.visitKey(url);
    if (this.visited.has(key)) return undefined;
    // Claim the URL before the first await so concurrent handlers for the
    // same URL cannot both fetch it.
    this.visited.add(key);

    if (await this.checkDisallowed(url)) return undefined;
    const page = await fetch(url, {
      headers: {
        "User-Agent": this.UA,
      },
    });
    if (!page.ok) return undefined;
    if (!page.headers.get("Content-Type")?.startsWith("text/html")) {
      return undefined;
    }

    return await page.text();
  }

  /** Fetches a page, queues its outlinks, indexes it, then emits `storePage`. */
  private async processPage(url: URL): Promise<void> {
    const page = await this.getPage(url);
    if (!page) return;
    await this.addOutlinks(page);
    this.index.addPage(url.toString(), page);
    this.emit("storePage", url);
  }

  /** Starts crawling from `url_str` by emitting `addURL` for it. */
  crawl(url_str: string | URL): void {
    this.emit("addURL", new URL(url_str));
  }
}

// Demo run: crawl one seed URL and stop after the first stored page.
const crawler = new Crawler("SmartFridge", new SearchIndex());

const url = new URL("https://example.com");
crawler.crawl(url);
crawler.on("storePage", (storedUrl) => {
  console.log(`Page stored: ${storedUrl}`);
  console.log("entries:", crawler.index.size());
  crawler.emit("stop");
});