A web scraper built to search for specific information on a given compound (and its synonyms)

Merge branch 'feature/PubChem-fixes' into develop

+63 -27
+1 -4
FourmiCrawler/settings.py
··· 21 21 # Crawl responsibly by identifying yourself (and your website) on the 22 22 # user-agent 23 23 24 - # [todo] - Check for repercussions on spoofing the user agent 25 - 26 - # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' 27 - USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' 24 + USER_AGENT = 'Fourmi'
+6 -5
FourmiCrawler/sources/ChemSpider.py
··· 1 + import re 2 + 1 3 from scrapy import log 2 4 from scrapy.http import Request 3 5 from scrapy.selector import Selector ··· 5 7 from source import Source 6 8 from FourmiCrawler.items import Result 7 9 8 - import re 9 10 10 11 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. 11 12 ··· 18 19 somewhere. 19 20 """ 20 21 21 - website = 'http://www.chemspider.com/*' 22 + website = 'http://www\\.chemspider\\.com/.*' 22 23 23 24 search = 'Search.asmx/SimpleSearch?query=%s&token=' 24 25 structure = 'Chemical-Structure.%s.html' ··· 276 277 log.msg('ChemSpider found multiple substances, taking first ' 277 278 'element', level=log.DEBUG) 278 279 csid = csids[0] 279 - structure_url = self.website[:-1] + self.structure % csid 280 - extendedinfo_url = self.website[:-1] + self.extendedinfo % csid 280 + structure_url = self.website[:-2].replace("\\", "") + self.structure % csid 281 + extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid 281 282 log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) 282 283 return [Request(url=structure_url, 283 284 callback=self.parse), ··· 292 293 """ 293 294 if compound in self.ignore_list or self.cfg['token'] == '': 294 295 return None 295 - searchurl = self.website[:-1] + self.search % compound 296 + searchurl = self.website[:-2].replace("\\", "") + self.search % compound 296 297 log.msg('chemspider compound', level=log.DEBUG) 297 298 return Request(url=searchurl, callback=self.parse_searchrequest)
+3 -3
FourmiCrawler/sources/NIST.py
··· 18 18 This plugin manages searching for a chemical on the NIST website 19 19 and parsing the resulting page if the chemical exists on NIST. 20 20 """ 21 - website = "http://webbook.nist.gov/*" 21 + website = "http://webbook\\.nist\\.gov/.*" 22 22 23 23 search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' 24 24 ··· 164 164 extra_data_url = tr.xpath('td[last()][a="Individual data points"]' 165 165 '/a/@href').extract() 166 166 if extra_data_url: 167 - request = Request(url=self.website[:-1] + extra_data_url[0], 167 + request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0], 168 168 callback=self.parse_individual_datapoints) 169 169 results.append(request) 170 170 continue ··· 329 329 """ 330 330 if compound not in self.ignore_list: 331 331 self.ignore_list.update(compound) 332 - return Request(url=self.website[:-1] + self.search % compound, 332 + return Request(url=self.website[:-2].replace("\\", "") + self.search % compound, 333 333 callback=self.parse)
+45 -8
FourmiCrawler/sources/PubChem.py
··· 1 + import re 2 + 1 3 from scrapy.http import Request 2 4 from scrapy import log 3 - from source import Source 4 5 from scrapy.selector import Selector 6 + 7 + from source import Source 5 8 from FourmiCrawler.items import Result 6 - import re 7 9 8 10 9 11 class PubChem(Source): ··· 14 16 """ 15 17 16 18 #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used 17 - website = 'https://*.ncbi.nlm.nih.gov/*' 18 - website_www = 'https://www.ncbi.nlm.nih.gov/*' 19 - website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' 19 + website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*' 20 + website_www = 'http://www.ncbi.nlm.nih.gov/*' 21 + website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*' 20 22 search = 'pccompound?term=%s' 21 23 data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' 22 24 ··· 49 51 self._spider.get_synonym_requests(synonym) 50 52 log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) 51 53 52 - n = re.search(r'cid=(\d+)',response.url) 54 + n = re.search(r'cid=(\d+)', response.url) 53 55 if n: 54 56 cid = n.group(1) 55 57 log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach 56 58 # the seperate html page which contains the properties and their values 57 59 58 60 #using this cid to get the right url and scrape it 59 - requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) 61 + requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data)) 60 62 return requests 61 63 62 64 def parse_data(self, response): ··· 106 108 107 109 return requests 108 110 111 + def parse_searchrequest(self, response): 112 + """ 113 + This function parses the response to the new_compound_request Request 114 + :param response: the Response object to be parsed 115 + :return: A Request for the compound page or what self.parse returns in 116 + case the search request 
forwarded to the compound page 117 + """ 118 + 119 + #check if pubchem forwarded straight to compound page 120 + m = re.match(self.website_pubchem, response.url) 121 + if m: 122 + log.msg('PubChem search forwarded to compound page', 123 + level=log.DEBUG) 124 + return self.parse(response) 125 + 126 + sel = Selector(response) 127 + 128 + results = sel.xpath('//div[@class="rsltcont"]') 129 + if results: 130 + url = results[0].xpath('div/p/a[1]/@href') 131 + else: 132 + log.msg('PubChem search found nothing or xpath failed', 133 + level=log.DEBUG) 134 + return None 135 + 136 + if url: 137 + url = 'http:' + ''.join(url[0].extract()) 138 + log.msg('PubChem compound page: %s' % url, level=log.DEBUG) 139 + else: 140 + log.msg('PubChem search found results, but no url in first result', 141 + level=log.DEBUG) 142 + return None 143 + 144 + return Request(url=url, callback=self.parse) 109 145 110 146 def new_compound_request(self, compound): 111 - return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) 147 + return Request(url=self.website_www[:-1] + self.search % compound, 148 + callback=self.parse_searchrequest)
+2 -2
FourmiCrawler/sources/WikipediaParser.py
··· 15 15 It also returns requests with other external sources which contain information on parsed subject. 16 16 """ 17 17 18 - website = "http://en.wikipedia.org/wiki/*" 18 + website = "http://en\\.wikipedia\\.org/wiki/.*" 19 19 __spider = None 20 20 searched_compounds = [] 21 21 ··· 123 123 return items 124 124 125 125 def new_compound_request(self, compound): 126 - return Request(url=self.website[:-1] + compound, callback=self.parse) 126 + return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse) 127 127 128 128 @staticmethod 129 129 def clean_items(items):
+2 -2
FourmiCrawler/sources/source.py
··· 3 3 4 4 5 5 class Source: 6 - website = "http://something/*" # Regex of URI's the source is able to parse 6 + website = "http://something/.*" # Regex of URI's the source is able to parse 7 7 _spider = None 8 8 9 9 def __init__(self, config=None): ··· 30 30 :param compound: A compound name. 31 31 :return: A new Scrapy Request 32 32 """ 33 - # return Request(url=self.website[:-1] + compound, callback=self.parse) 33 + # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse) 34 34 pass 35 35 36 36 def set_spider(self, spider):
+2 -1
FourmiCrawler/spider.py
··· 34 34 """ 35 35 for source in self._sources: 36 36 if re.match(source.website, response.url): 37 - log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG) 37 + log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG) 38 38 return source.parse(response) 39 + log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO) 39 40 return None 40 41 41 42 def get_synonym_requests(self, compound, force=False):
+2 -2
utils/configurator.py
··· 1 1 import ConfigParser 2 + import os 2 3 3 4 from scrapy.utils.project import get_project_settings 4 - import os 5 5 6 6 class Configurator: 7 7 """ ··· 67 67 :return a ConfigParser object of sources.cfg 68 68 """ 69 69 current_dir = os.path.dirname(os.path.abspath(__file__)) 70 - config_path = current_dir + '\..\sources.cfg' 70 + config_path = current_dir + '/../sources.cfg' 71 71 # [TODO]: location of sources.cfg should be softcoded eventually 72 72 config = ConfigParser.ConfigParser() 73 73 config.read(config_path)