import { SearchIndex } from ".";
import { toRegex } from "glob-to-regex.js";
import { EventEmitter } from "node:events";

/** Allow / Disallow path patterns that apply to one user agent. */
interface RobotUrls {
  allows: Set<string>;
  disallows: Set<string>;
}

/**
 * Reduces a URL to the part robots.txt rules are written against
 * (path plus query string). Input that does not parse as an absolute URL is
 * assumed to already be a path and is returned unchanged.
 */
function toRobotsPath(url: string): string {
  try {
    const parsed = new URL(url);
    return parsed.pathname + parsed.search;
  } catch {
    return url;
  }
}

/**
 * Compiles a robots.txt path pattern into a RegExp: the pattern is a prefix
 * match, `*` matches any run of characters, and a trailing `$` anchors the
 * end of the path. Every other character is matched literally.
 */
function robotsPatternToRegex(pattern: string): RegExp {
  const anchored = pattern.endsWith("$");
  const body = anchored ? pattern.slice(0, -1) : pattern;
  const source = body
    .split("*")
    .map((part) => part.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
    .join(".*");
  return new RegExp(`^${source}${anchored ? "$" : ""}`);
}

/**
 * Minimal robots.txt parser.
 *
 * After construction, `allow` and `disallow` map each lower-cased user-agent
 * token found in the file to the set of path patterns declared for it.
 */
export class RobotsParser {
  disallow: Map<string, Set<string>> = new Map();
  allow: Map<string, Set<string>> = new Map();

  /**
   * Parses the text of a robots.txt file.
   *
   * A group is a run of `User-agent` lines followed by the `Allow` /
   * `Disallow` rules that come after it; blank lines and comments are
   * ignored. Rules that appear before any `User-agent` line are dropped.
   *
   * @param text Raw robots.txt content (LF or CRLF line endings).
   */
  constructor(text: string) {
    let agents: string[] = [];
    // True while we are still reading the User-agent lines that open a group.
    let readingAgents = false;

    for (const rawLine of text.split(/\r?\n/)) {
      const line = rawLine.replace(/#.*$/, "").trim(); // strip comments
      if (line === "") continue;

      // Split on the FIRST colon only, so values containing ':' stay intact.
      const separator = line.indexOf(":");
      if (separator < 0) continue;
      const field = line.slice(0, separator).trim().toLowerCase();
      // Paths are case-sensitive, so the value keeps its original case.
      const value = line.slice(separator + 1).trim();

      if (field === "user-agent") {
        if (!readingAgents) {
          // A User-agent line after rules starts a new group.
          agents = [];
          readingAgents = true;
        }
        const ua = value.toLowerCase();
        if (ua === "") continue;
        agents.push(ua);
        if (!this.allow.has(ua)) this.allow.set(ua, new Set());
        if (!this.disallow.has(ua)) this.disallow.set(ua, new Set());
      } else if (field === "allow" || field === "disallow") {
        readingAgents = false;
        // An empty value (e.g. "Disallow:") restricts nothing.
        if (value === "") continue;
        const target = field === "allow" ? this.allow : this.disallow;
        for (const ua of agents) {
          target.get(ua)?.add(value);
        }
      }
      // Any other field (sitemap, crawl-delay, ...) is ignored.
    }
  }

  /**
   * Decides whether `url` may be fetched under the given rules.
   *
   * The longest matching pattern wins; when an Allow and a Disallow pattern
   * of equal length both match, Allow wins. A URL matched by no Disallow
   * pattern is allowed.
   *
   * @param urls Rules for one user agent (see {@link getUrlsForUA}).
   * @param url Absolute URL or bare path to test.
   * @returns `true` if fetching is allowed, `false` if it is disallowed.
   */
  static checkUserAgent(urls: RobotUrls, url: string): boolean {
    const path = toRobotsPath(url);

    // Length of the longest pattern that matches `path`, or -1 if none does.
    const longestMatch = (patterns: Iterable<string>): number => {
      let best = -1;
      for (const pattern of patterns) {
        if (pattern === "" || pattern.length <= best) continue;
        if (robotsPatternToRegex(pattern).test(path)) best = pattern.length;
      }
      return best;
    };

    const disallowed = longestMatch(urls.disallows);
    if (disallowed < 0) return true;
    return longestMatch(urls.allows) >= disallowed;
  }

  /**
   * Collects the rules of every group whose user-agent token matches `ua`.
   *
   * The `*` token matches every agent; any other token is compiled with
   * `toRegex` and tested against the lower-cased `ua`. Rules from all
   * matching groups are merged.
   *
   * @param ua User-agent name of the crawler (case-insensitive).
   */
  getUrlsForUA(ua: string): RobotUrls {
    const agent = ua.toLowerCase();
    const applies = (token: string): boolean =>
      token === "*" || toRegex(token).test(agent);

    const collect = (groups: Map<string, Set<string>>): Set<string> => {
      const merged = new Set<string>();
      for (const [token, rules] of groups) {
        if (!applies(token)) continue;
        for (const rule of rules) merged.add(rule);
      }
      return merged;
    };

    return {
      allows: collect(this.allow),
      disallows: collect(this.disallow),
    };
  }
}

// Absolute http(s) URLs; stops at whitespace, quotes and angle brackets so
// that surrounding HTML markup is not swallowed into the match.
const urlRegex = /https?:\/\/[^\s"'<>]+/g;

/**
 * Event-driven crawler.
 *
 * Events:
 * - `addURL` (url: URL): queue a URL; the crawler fetches and processes it.
 * - `storePage` (url: URL): emitted after a page was added to the index.
 * - `stop`: removes all listeners, so no further URLs are picked up.
 */
export class Crawler extends EventEmitter {
  // origin -> robots rules allowed and disallowed for the specified UA
  private robots: Map<string, RobotUrls> = new Map();
  // hrefs (without fragment) of URLs that were already picked up
  private visited: Set<string> = new Set();
  private stopped = false;

  /**
   * @param UA User-agent string sent with requests and matched against
   *   robots.txt groups.
   * @param index Search index that receives every fetched HTML page.
   */
  constructor(
    private readonly UA: string,
    public index: SearchIndex,
  ) {
    super();
    this.on("addURL", (url: URL) => {
      console.log(`Adding URL: ${url.href}`);
      // Fire-and-forget: report failures instead of leaving the rejection
      // unhandled.
      this.processPage(url).catch((err: unknown) => {
        console.error(`Failed to process ${url.href}:`, err);
      });
    });
    this.once("stop", () => {
      this.stopped = true;
      this.removeAllListeners();
    });
  }

  /** Key under which a URL is remembered; the fragment never changes the page. */
  private static visitKey(url: URL): string {
    const copy = new URL(url.href);
    copy.hash = "";
    return copy.href;
  }

  /** Resolves to `true` when robots.txt forbids this crawler to fetch `url`. */
  private async checkDisallowed(url: URL): Promise<boolean> {
    const robots =
      this.robots.get(url.origin) ?? (await this.getRobotsTxt(url));
    return !RobotsParser.checkUserAgent(robots, url.toString());
  }

  /**
   * Fetches and parses robots.txt for the origin of `url` and caches the
   * rules that apply to this crawler. A non-200 response or a body that is
   * not `text/plain` is treated as "no rules" and is cached as well, so the
   * file is requested only once per origin.
   */
  private async getRobotsTxt(url: URL): Promise<RobotUrls> {
    // Resolve against the origin so a non-default port is preserved.
    const robotsTxtUrl = new URL("/robots.txt", url.origin);
    const response = await fetch(robotsTxtUrl, {
      headers: {
        "User-Agent": this.UA,
      },
    });

    let forUA: RobotUrls = { allows: new Set(), disallows: new Set() };
    if (
      response.status === 200 &&
      response.headers.get("content-type")?.startsWith("text/plain")
    ) {
      const parsed = new RobotsParser(await response.text());
      forUA = parsed.getUrlsForUA(this.UA);
    }
    this.robots.set(url.origin, forUA);
    return forUA;
  }

  /**
   * Scans `html` for absolute http(s) URLs and queues, via `addURL`, every
   * one that is new and permitted by robots.txt.
   */
  private async addOutlinks(html: string): Promise<void> {
    for (const [link] of html.matchAll(urlRegex)) {
      if (this.stopped) return;
      console.log(link);

      let url: URL;
      try {
        url = new URL(link);
      } catch {
        continue; // the regex matched something that is not a valid URL
      }

      if (this.visited.has(Crawler.visitKey(url))) continue;
      if (!(await this.checkDisallowed(url))) {
        this.emit("addURL", url);
      }
    }
  }

  // private getText(html: string): string {
  //   const parser = new DOMParser();
  //   const doc = parser.parseFromString(html, "text/html");
  //   return doc.body.textContent || "";
  // }

  /**
   * Downloads `url` once.
   *
   * @returns The response body, or `undefined` when the URL was already
   *   picked up, is disallowed by robots.txt, did not return an OK status,
   *   or is not `text/html`.
   */
  private async getPage(url: URL): Promise<string | undefined> {
    const key = Crawler.visitKey(url);
    if (this.visited.has(key)) return undefined;
    // Claim the URL before the first await so that concurrent addURL events
    // for the same page do not fetch it twice.
    this.visited.add(key);

    if (await this.checkDisallowed(url)) return undefined;
    const page = await fetch(url, {
      headers: {
        "User-Agent": this.UA,
      },
    });
    if (!page.ok) return undefined;
    if (!page.headers.get("Content-Type")?.startsWith("text/html")) {
      return undefined;
    }
    return await page.text();
  }

  /** Fetches a page, queues its outlinks, indexes it and emits `storePage`. */
  private async processPage(url: URL): Promise<void> {
    const page = await this.getPage(url);
    if (!page) return;
    await this.addOutlinks(page);
    this.index.addPage(url.toString(), page);
    this.emit("storePage", url);
  }

  /** Starts crawling at `url_str` by queueing it through the `addURL` event. */
  crawl(url_str: string | URL): void {
    this.emit("addURL", new URL(url_str));
  }
}

// Demo: crawl a single seed and stop as soon as the first page is stored.
const crawler = new Crawler("SmartFridge", new SearchIndex());
const url = new URL("https://example.com");
// Register the listener before starting so no storePage event can be missed.
crawler.on("storePage", (stored: URL) => {
  console.log(`Page stored: ${stored.href}`);
  console.log("entries:", crawler.index.size());
  crawler.emit("stop");
});
crawler.crawl(url);