A web scraper built to search for specific information about a given compound (and its pseudonyms)

A lot of PEP-8 fixes

+25 -27
+3 -3
FourmiCrawler/sources/ChemSpider.py
··· 40 40 self.search += self.cfg['token'] 41 41 self.extendedinfo += self.cfg['token'] 42 42 43 - 44 43 def parse(self, response): 45 44 sel = Selector(response) 46 45 requests = [] ··· 202 201 return properties 203 202 204 203 def newresult(self, attribute, value, conditions='', source='ChemSpider'): 205 - return Result({ 204 + return Result( 205 + { 206 206 'attribute': attribute, 207 207 'value': value, 208 208 'source': source, 209 209 'reliability': self.cfg['reliability'], 210 210 'conditions': conditions 211 - }) 211 + }) 212 212 213 213 def parse_searchrequest(self, response): 214 214 """Parse the initial response of the ChemSpider Search API """
+7 -7
FourmiCrawler/sources/NIST.py
··· 89 89 InChiKey, CAS number 90 90 """ 91 91 ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]') 92 - li = ul.xpath('li') 93 92 94 93 raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract() 95 94 for synonym in raw_synonyms[0].strip().split(';\n'): ··· 256 255 return results 257 256 258 257 def newresult(self, attribute, value, conditions=''): 259 - return Result({ 260 - 'attribute': attribute, 261 - 'value': value, 262 - 'source': 'NIST', 263 - 'reliability': self.cfg['reliability'], 264 - 'conditions': conditions 258 + return Result( 259 + { 260 + 'attribute': attribute, 261 + 'value': value, 262 + 'source': 'NIST', 263 + 'reliability': self.cfg['reliability'], 264 + 'conditions': conditions 265 265 }) 266 266 267 267 def new_compound_request(self, compound):
+8 -9
FourmiCrawler/sources/WikipediaParser.py
··· 19 19 __spider = None 20 20 searched_compounds = [] 21 21 22 - 23 22 def __init__(self, config=None): 24 23 Source.__init__(self, config) 25 24 if config is None: ··· 57 56 # scrape the chembox (wikipedia template) 58 57 items = self.parse_chembox(sel, items) 59 58 60 - #scrape the drugbox (wikipedia template) 59 + # scrape the drugbox (wikipedia template) 61 60 items = self.parse_drugbox(sel, items) 62 61 63 62 items = filter(lambda a: a['value'] != '', items) # remove items with an empty value ··· 127 126 level=log.DEBUG) 128 127 return items 129 128 130 - 131 129 def new_compound_request(self, compound): 132 130 return Request(url=self.website[:-1] + compound, callback=self.parse) 133 131 ··· 165 163 return links 166 164 167 165 def newresult(self, attribute, value): 168 - return Result({ 169 - 'attribute': attribute, 170 - 'value': value, 171 - 'source': 'Wikipedia', 172 - 'reliability': self.cfg['reliability'], 173 - 'conditions': '' 166 + return Result( 167 + { 168 + 'attribute': attribute, 169 + 'value': value, 170 + 'source': 'Wikipedia', 171 + 'reliability': self.cfg['reliability'], 172 + 'conditions': '' 174 173 })
+1 -1
FourmiCrawler/spider.py
··· 21 21 super(FourmiSpider, self).__init__(*args, **kwargs) 22 22 self.synonyms.add(compound) 23 23 if selected_attributes is None: 24 - self.selected_attributes = [".*"] 24 + self.selected_attributes = [".*"] 25 25 else: 26 26 self.selected_attributes = selected_attributes 27 27
+4 -2
fourmi.py
··· 60 60 conf = Configurator() 61 61 conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) 62 62 conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) 63 - setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) 64 - log.start(conf.scrapy_settings.get("LOG_FILE"), conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) 63 + setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, 64 + source_loader, docopt_arguments["--attributes"].split(',')) 65 + log.start(conf.scrapy_settings.get("LOG_FILE"), 66 + conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) 65 67 reactor.run() 66 68 67 69
-1
tests/test_spider.py
··· 47 47 self.assertGreater(len(requests), 0) 48 48 self.assertIsInstance(requests[0], Request) 49 49 50 - 51 50 def test_synonym_requests(self): 52 51 # A test for the synonym request function 53 52 self.spi._sources = []
+1 -4
utils/configurator.py
··· 12 12 def __init__(self): 13 13 self.scrapy_settings = get_project_settings() 14 14 15 - 16 15 def set_output(self, filename, fileformat): 17 16 """ 18 17 This function manipulates the Scrapy output file settings that normally would be set in the settings file. ··· 30 29 31 30 if fileformat is not None: 32 31 self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat 33 - 34 32 35 33 def set_logging(self, logfile=None, verbose=0): 36 34 """ ··· 61 59 else: 62 60 self.scrapy_settings.overrides["LOG_FILE"] = None 63 61 64 - 65 62 @staticmethod 66 63 def read_sourceconfiguration(): 67 64 """ ··· 70 67 :return a ConfigParser object of sources.cfg 71 68 """ 72 69 config = ConfigParser.ConfigParser() 73 - config.read('sources.cfg') # [TODO]: should be softcoded eventually 70 + config.read('sources.cfg') # [TODO]: should be softcoded eventually 74 71 return config 75 72 76 73 @staticmethod
+1
utils/sourceloader.py
··· 5 5 from FourmiCrawler.sources.source import Source 6 6 from utils.configurator import Configurator 7 7 8 + 8 9 class SourceLoader: 9 10 sources = [] 10 11