// CMU Coding Bootcamp
1import { SearchIndex } from ".";
2import { toRegex } from "glob-to-regex.js";
3import { EventEmitter } from "node:events";
4
5interface RobotUrls {
6 allows: Set<string>;
7 disallows: Set<string>;
8}
9
10export class RobotsParser {
11 disallow: Map<string, Set<string>> = new Map();
12 allow: Map<string, Set<string>> = new Map();
13
14 constructor(text: string) {
15 const lines = text
16 .split("\n")
17 .filter((l) => !/^\s*#.*$/.test(l)) // remove full-line comments
18 .map((l) => l.replace(/\s*#.*$/, "")); // remove end-of-line comments
19 lines.push("");
20
21 const blocks: Array<Array<string>> = [];
22 let current_block: Array<string> = [];
23 lines.forEach((line) => {
24 if (line == "") {
25 if (current_block.length == 0) return; // ignore consecutive empty lines
26 blocks.push(current_block);
27 current_block = new Array();
28 } else {
29 current_block.push(line);
30 }
31 });
32
33 blocks.forEach((block) => {
34 let uas: string[] = [];
35 let disallows: string[] = [];
36 let allows: string[] = [];
37 block.forEach((line) => {
38 line = line.trim().toLowerCase();
39 const fields: Array<string> = line.split(/\s*:\s*/);
40 if (fields.length < 2) return;
41 if (fields[0] == "user-agent") {
42 uas.push(fields[1]!);
43 } else if (fields[0] == "disallow") {
44 disallows.push(fields[1]!);
45 } else if (fields[0] == "allow") {
46 allows.push(fields[1]!);
47 }
48 });
49 uas.forEach((ua) => {
50 ua = ua.toLowerCase();
51 this.disallow.set(
52 ua,
53 new Set([...(this.disallow.get(ua) || []), ...disallows]),
54 );
55 this.allow.set(
56 ua,
57 new Set([...(this.allow.get(ua) || []), ...allows]),
58 );
59 });
60 });
61 }
62
63 static checkUserAgent(urls: RobotUrls, url: string): boolean {
64 const { allows, disallows } = urls;
65 const allowed = allows
66 .values()
67 .map((allow) => {
68 const regex = toRegex(allow);
69 return regex.test(url);
70 })
71 .reduce((acc, curr) => acc || curr, false);
72 if (allowed) {
73 return true;
74 }
75 const disallowed = disallows
76 .values()
77 .map((disallow) => {
78 const regex = toRegex(disallow);
79 return regex.test(url);
80 })
81 .reduce((acc, curr) => acc || curr, false);
82 return !disallowed;
83 }
84
85 getUrlsForUA(ua: string): RobotUrls {
86 ua = ua.toLowerCase();
87 const allowUAs = this.allow
88 .keys()
89 .filter((key) => toRegex(key).test(ua));
90 const disallowUAs = this.disallow
91 .keys()
92 .filter((key) => toRegex(key).test(ua));
93 let allows = new Set<string>();
94 let disallows = new Set<string>();
95
96 allowUAs.forEach((ua) => {
97 const allow = this.allow.get(ua);
98 if (allow) {
99 allows = allows.union(allow);
100 }
101 });
102 disallowUAs.forEach((ua) => {
103 const disallow = this.disallow.get(ua);
104 if (disallow) {
105 disallows = disallows.union(disallow);
106 }
107 });
108 return {
109 allows,
110 disallows,
111 };
112 }
113}
114
115const urlRegex = /https?:\/\/[^\s\"]+/g;
116export class Crawler extends EventEmitter {
117 private robots: Map<string, RobotUrls> = new Map(); // hostname, robots allowed and disallowed for the sepcified UA
118 private visited: Set<URL> = new Set(); // URLS
119
120 constructor(
121 private readonly UA: string,
122 public index: SearchIndex,
123 ) {
124 super();
125 this.on("addURL", (url: URL) => {
126 console.log(`Adding URL: ${url}`);
127 void this.processPage(url);
128 });
129 this.once("stop", () => {
130 this.removeAllListeners();
131 });
132 }
133
134 private async checkDisallowed(url: URL): Promise<boolean> {
135 const robots =
136 this.robots.get(url.hostname) || (await this.getRobotsTxt(url));
137 return !RobotsParser.checkUserAgent(robots, url.toString());
138 }
139
140 private async getRobotsTxt(url: URL): Promise<RobotUrls> {
141 const robotsTxtUrl = new URL(
142 `${url.protocol}//${url.hostname}/robots.txt`,
143 );
144
145 const response = await fetch(robotsTxtUrl, {
146 headers: {
147 "User-Agent": this.UA,
148 },
149 });
150 if (response.status !== 200)
151 return { allows: new Set(), disallows: new Set() };
152 if (!response.headers.get("content-type")?.startsWith("text/plain"))
153 return { allows: new Set(), disallows: new Set() };
154 const robotsTxt = await response.text();
155 const parsed = new RobotsParser(robotsTxt);
156 const forUA = parsed.getUrlsForUA(this.UA);
157 this.robots.set(url.hostname, forUA);
158 return forUA;
159 }
160
161 private async addOutlinks(html: string): Promise<void> {
162 const links = html.matchAll(urlRegex);
163 if (!links) return;
164 for (const [link, ..._] of links) {
165 console.log(link);
166 const url = new URL(link);
167 if (await this.checkDisallowed(url)) {
168 this.emit("addURL", url);
169 }
170 }
171 }
172
173 // private getText(html: string): string {
174 // const parser = new DOMParser();
175 // const doc = parser.parseFromString(html, "text/html");
176 // return doc.body.textContent || "";
177 // }
178
179 private async getPage(url: URL) {
180 if (this.visited.has(url)) return;
181 if (await this.checkDisallowed(url)) return;
182 const page = await fetch(url);
183 this.visited.add(url);
184 if (!page.ok) return;
185 if (!page.headers.get("Content-Type")?.startsWith("text/html")) return;
186
187 return await page.text();
188 }
189
190 private async processPage(url: URL) {
191 const page = await this.getPage(url);
192 if (!page) return;
193 await this.addOutlinks(page);
194 this.index.addPage(url.toString(), page);
195 this.emit("storePage", url);
196 }
197
198 crawl(url_str: string | URL) {
199 this.emit("addURL", new URL(url_str));
200 }
201}
202
203let crawler = new Crawler("SmartFridge", new SearchIndex());
204
205const url = new URL("https://example.com");
206crawler.crawl(url);
207crawler.on("storePage", (url) => {
208 console.log(`Page stored: ${url}`);
209 console.log("entries:", crawler.index.size());
210 crawler.emit("stop");
211});