A web scraper built to search for specific information on a given compound (and its pseudonyms)
at develop 149 lines 6.1 kB view raw
import re

from scrapy.http import Request
from scrapy import log
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class PubChem(Source):
    """ PubChem scraper for chemical properties

    This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
    including sources of the values of properties.
    """

    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'     # regex matching every NCBI host this source handles
    website_www = 'http://www.ncbi.nlm.nih.gov/*'         # glob-style base for search requests (trailing '*' stripped)
    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'  # regex base for compound pages (trailing '.*' stripped)
    search = 'pccompound?term=%s'                         # search path, %s is the compound name
    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'      # properties page path, %s is the PubChem cid

    # NOTE(review): declared with name mangling (_PubChem__spider) but parse()
    # reads self._spider — presumably set by the parent Source class; confirm.
    __spider = None
    searched_compounds = set()  # compound/synonym names already handled, to avoid duplicate scraping

    def __init__(self, config):
        """Initialize the source with the framework configuration dict.

        :param config: configuration dictionary; must contain 'reliability'
        """
        Source.__init__(self, config)
        self.cfg = config

    def parse(self, response):
        """
        Parse a PubChem compound page: register the compound and its synonyms,
        then request the separate 'Chemical and Physical properties' page.

        :param response: The incoming search request
        :return: a list of Requests for the property page if the compound is
                 new, or None if it was already processed
        """
        requests = []
        log.msg('A response from %s just arrived!'
                % response.url, level=log.DEBUG)

        sel = Selector(response)
        compound = sel.xpath('//h1/text()').extract()[0]
        if compound in self.searched_compounds:
            return None

        # BUG FIX: set.update(<str>) adds each individual character of the
        # string; add() stores the whole compound name as a single entry,
        # which is what the membership test above relies on.
        self.searched_compounds.add(compound)
        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
        for synonym in raw_synonyms.strip().split(', '):
            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
            # BUG FIX: same update() -> add() correction as above.
            self.searched_compounds.add(synonym)
            self._spider.get_synonym_requests(synonym)
        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)

        # Extract the compound id (cid) from the page URL; the properties
        # live on a separate HTML page addressed by this cid.
        n = re.search(r'cid=(\d+)', response.url)
        if n:
            cid = n.group(1)
            log.msg('cid: %s' % cid, level=log.DEBUG)

            # website_pubchem is a regex; dropping the trailing '.*' and the
            # backslash escapes turns it into a plain base URL.
            requests.append(
                Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid,
                        callback=self.parse_data))
        return requests

    def parse_data(self, response):
        """
        Parse data found in 'Chemical and Physical properties' part of a substance page.

        :param response: The response with the page to parse
        :return: a list of Result items (attribute, value, source, reliability,
                 conditions), one per property value found
        """
        log.msg('parsing data', level=log.DEBUG)
        requests = []

        sel = Selector(response)
        props = sel.xpath('//div')

        for prop in props:
            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
            if prop.xpath('a'):  # parsing for single value in property
                prop_source = ''.join(prop.xpath('a/@title').extract())
                prop_value = ''.join(prop.xpath('a/text()').extract())
                new_prop = Result({
                    'attribute': prop_name,
                    'value': prop_value,
                    'source': prop_source,
                    'reliability': self.cfg['reliability'],
                    'conditions': ''
                })
                log.msg('PubChem prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)
                requests.append(new_prop)
            elif prop.xpath('ul'):  # parsing for multiple values (list) in property
                prop_values = prop.xpath('ul//li')
                for prop_li in prop_values:
                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
                    prop_source = ''.join(prop_li.xpath('a/@title').extract())
                    new_prop = Result({
                        'attribute': prop_name,
                        'value': prop_value,
                        'source': prop_source,
                        'reliability': self.cfg['reliability'],
                        'conditions': ''
                    })
                    log.msg('PubChem prop: |%s| |%s| |%s|' %
                            (new_prop['attribute'], new_prop['value'],
                             new_prop['source']), level=log.DEBUG)
                    requests.append(new_prop)

        return requests

    def parse_searchrequest(self, response):
        """
        This function parses the response to the new_compound_request Request

        :param response: the Response object to be parsed
        :return: A Request for the compound page or what self.parse returns in
                 case the search request forwarded to the compound page;
                 None when the search yielded no usable result
        """
        # check if pubchem forwarded straight to compound page
        m = re.match(self.website_pubchem, response.url)
        if m:
            log.msg('PubChem search forwarded to compound page',
                    level=log.DEBUG)
            return self.parse(response)

        sel = Selector(response)

        results = sel.xpath('//div[@class="rsltcont"]')
        if results:
            url = results[0].xpath('div/p/a[1]/@href')
        else:
            log.msg('PubChem search found nothing or xpath failed',
                    level=log.DEBUG)
            return None

        if url:
            # hrefs on the results page are protocol-relative
            url = 'http:' + ''.join(url[0].extract())
            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
        else:
            log.msg('PubChem search found results, but no url in first result',
                    level=log.DEBUG)
            return None

        return Request(url=url, callback=self.parse)

    def new_compound_request(self, compound):
        """Build the initial search Request for a compound name.

        :param compound: the compound name to search for
        :return: a Request for the PubChem search page, parsed by
                 parse_searchrequest
        """
        # website_www ends with '*'; strip it to get the plain base URL.
        return Request(url=self.website_www[:-1] + self.search % compound,
                       callback=self.parse_searchrequest)