A web scraper built to search for specific information about a given compound (and its pseudonyms)
1import re
2
3from scrapy.http import Request
4from scrapy import log
5from scrapy.selector import Selector
6
7from source import Source
8from FourmiCrawler.items import Result
9
10
class PubChem(Source):
    """ PubChem scraper for chemical properties

    This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
    including sources of the values of properties.

    Flow implemented by this class:
    new_compound_request() builds a search Request -> parse_searchrequest() finds the compound page ->
    parse() records the compound name and its synonyms and requests the properties page ->
    parse_data() turns the properties page into Result items.
    """

    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
    # The trailing wildcard characters of the two patterns below are sliced off to build base URLs
    # (see new_compound_request() and parse()); website_pubchem is also used as a regex in parse_searchrequest().
    website_www = 'http://www.ncbi.nlm.nih.gov/*'
    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
    search = 'pccompound?term=%s'
    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

    # NOTE(review): name-mangled to _PubChem__spider and never read in this class; parse() uses self._spider,
    # which is presumably provided by Source - confirm before removing.
    __spider = None
    # Class-level set, so it is shared by every PubChem instance; holds compound names and synonyms already seen.
    searched_compounds = set()

    def __init__(self, config):
        """
        Initialise the source and keep the configuration.
        :param config: configuration mapping; parse_data() reads config['reliability'] from it
        """
        Source.__init__(self, config)
        self.cfg = config

    def parse(self, response):
        """
        Handle a compound page: remember the compound and its synonyms, then request the properties page.
        :param response: The response of a PubChem compound page
        :return: A list holding the Request for the properties page (empty when the response URL carries no cid),
                 or None when the compound is already known or the page has no compound name
        """
        requests = []
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)

        sel = Selector(response)
        headings = sel.xpath('//h1/text()').extract()
        if not headings:
            # Without a name there is nothing to de-duplicate on; bail out instead of raising IndexError.
            log.msg('PubChem page without compound name: %s' % response.url, level=log.DEBUG)
            return None

        compound = headings[0]
        if compound in self.searched_compounds:
            return None

        # add(), not update(): set.update() with a string would insert every single character of the name,
        # so the membership test above would never recognise a compound that was already handled.
        self.searched_compounds.add(compound)

        synonym_texts = sel.xpath('//div[@class="smalltext"]/text()').extract()
        if synonym_texts:
            raw_synonyms = synonym_texts[0]
            for synonym in raw_synonyms.strip().split(', '):
                log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
                self.searched_compounds.add(synonym)
                # NOTE(review): the return value is discarded here - confirm whether the spider schedules
                # the synonym requests itself or whether they should be added to `requests`.
                self._spider.get_synonym_requests(synonym)
            log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
        else:
            log.msg('PubChem page lists no synonyms: %s' % response.url, level=log.DEBUG)

        n = re.search(r'cid=(\d+)', response.url)
        if n:
            # the cid is the id of the compound with which the separate html page, containing the
            # properties and their values, can be reached
            cid = n.group(1)
            log.msg('cid: %s' % cid, level=log.DEBUG)

            # strip the trailing '.*' (and any backslashes) from the pattern to get the base URL,
            # then append the data page path for this cid
            base_url = self.website_pubchem[:-2].replace("\\", "")
            requests.append(Request(url=base_url + self.data_url % cid, callback=self.parse_data))
        return requests

    def _property_result(self, attribute, value, source):
        """
        Build the Result for a single property value and log it.
        :param attribute: name of the property
        :param value: text of the property value
        :param source: source of the value (the title of its link)
        :return: the new Result
        """
        new_prop = Result({
            'attribute': attribute,
            'value': value,
            'source': source,
            'reliability': self.cfg['reliability'],
            'conditions': ''
        })
        log.msg('PubChem prop: |%s| |%s| |%s|' %
                (new_prop['attribute'], new_prop['value'],
                 new_prop['source']), level=log.DEBUG)
        return new_prop

    def parse_data(self, response):
        """
        Parse data found in 'Chemical and Physical properties' part of a substance page.
        :param response: The response with the page to parse
        :return: requests: Returns a list of properties with their values, source, etc.
        """
        log.msg('parsing data', level=log.DEBUG)
        requests = []

        sel = Selector(response)
        for prop in sel.xpath('//div'):
            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
            if prop.xpath('a'):  # single value: the link sits directly in the div
                holders = [prop]
            elif prop.xpath('ul'):  # multiple values: one link per list item
                holders = prop.xpath('ul//li')
            else:
                continue

            for holder in holders:
                prop_value = ''.join(holder.xpath('a/text()').extract())
                prop_source = ''.join(holder.xpath('a/@title').extract())
                requests.append(self._property_result(prop_name, prop_value, prop_source))

        return requests

    def parse_searchrequest(self, response):
        """
        This function parses the response to the new_compound_request Request
        :param response: the Response object to be parsed
        :return: A Request for the compound page or what self.parse returns in
                 case the search request forwarded to the compound page; None when
                 the search page has no usable result
        """

        # check if pubchem forwarded straight to compound page
        if re.match(self.website_pubchem, response.url):
            log.msg('PubChem search forwarded to compound page',
                    level=log.DEBUG)
            return self.parse(response)

        sel = Selector(response)

        results = sel.xpath('//div[@class="rsltcont"]')
        if not results:
            log.msg('PubChem search found nothing or xpath failed',
                    level=log.DEBUG)
            return None

        # only the first search result is followed
        url = results[0].xpath('div/p/a[1]/@href')
        if not url:
            log.msg('PubChem search found results, but no url in first result',
                    level=log.DEBUG)
            return None

        # the scheme is prepended to the extracted href
        url = 'http:' + ''.join(url[0].extract())
        log.msg('PubChem compound page: %s' % url, level=log.DEBUG)

        return Request(url=url, callback=self.parse)

    def new_compound_request(self, compound):
        """
        Build the search Request for a compound.
        :param compound: name of the compound to search for
        :return: A Request for the PubChem search page, handled by parse_searchrequest
        """
        # website_www[:-1] drops the trailing '*' of the pattern, leaving the base URL
        return Request(url=self.website_www[:-1] + self.search % compound,
                       callback=self.parse_searchrequest)