A web scraper built to search for specific information on a given compound (and its pseudonyms)

added ignore list

RTB 56ee6b1a 98f58ea4

+8 -2
+8 -2
FourmiCrawler/sources/NIST.py
··· 5 5 from FourmiCrawler.items import Result 6 6 import re 7 7 8 + # [TODO]: values can be '128.', perhaps remove the dot in that case? 9 + 8 10 class NIST(Source): 9 11 website = "http://webbook.nist.gov/*" 10 12 11 13 search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' 14 + 15 + ignore_list = set() 12 16 13 17 def __init__(self): 14 18 Source.__init__(self) ··· 235 239 return results 236 240 237 241 def new_compound_request(self, compound): 238 - return Request(url=self.website[:-1] + self.search % compound, 239 - callback=self.parse) 242 + if compound not in self.ignore_list: 243 + self.ignore_list.update(compound) 244 + return Request(url=self.website[:-1] + self.search % compound, 245 + callback=self.parse)