"""A web scraper built to search for specific information about a given compound (and its synonyms)."""
1import re
2
3from scrapy import log
4from scrapy.http import Request
5from scrapy.selector import Selector
6
7from source import Source
8from FourmiCrawler.items import Result
9
10
11# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
12
13
class ChemSpider(Source):
    """ChemSpider scraper for synonyms and properties.

    This parser will manage searching for chemicals through the
    ChemSpider API, and parsing the resulting ChemSpider page.
    The token required for the API should be in a configuration file
    somewhere.
    """

    def __init__(self):
        Source.__init__(self)

    website = 'http://www.chemspider.com/*'

    # [TODO] - Save and access token of specific user.
    # The API token is shared by the search and extended-info endpoints,
    # so keep it in one place instead of duplicating the literal.
    _token = '052bfd06-5ce4-43d6-bf12-89eabefd2338'

    # Relative URL templates; '%s' is filled with the query / ChemSpider ID.
    search = 'Search.asmx/SimpleSearch?query=%s&token=' + _token
    structure = 'Chemical-Structure.%s.html'
    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
                    + _token)

    # Compound names already handled; used to suppress duplicate searches.
    ignore_list = []

    def parse(self, response):
        """Parse a ChemSpider structure page.

        Collects follow-up requests produced while scraping the synonym
        list and the Result items scraped from the property tabs.

        :param response: the scrapy Response for a Chemical-Structure page
        :return: a list of Result items and follow-up Requests
        """
        sel = Selector(response)
        requests = []
        requests.extend(self.parse_synonyms(sel))
        requests.extend(self.parse_properties(sel))
        return requests

    @staticmethod
    def parse_properties(sel):
        """Scrape the Experimental Data and Predicted ACD/Labs tabs.

        :param sel: a Selector positioned at the structure page
        :return: a list of Result items, one per scraped property
        """
        properties = []

        # Predicted - ACD/Labs tab: the table alternates name/value cells.
        td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
            'normalize-space(string())')
        prop_names = td_list[::2]
        prop_values = td_list[1::2]
        for (prop_name, prop_value) in zip(prop_names, prop_values):
            # [:-1] is to remove the colon at the end, [TODO] - test for colon
            prop_name = prop_name.extract().encode('utf-8')[:-1]
            prop_value = prop_value.extract().encode('utf-8')
            prop_conditions = ''

            # Skip properties without a numeric value, plus one hardcoded
            # exception: a unit-only Polarizability cell.
            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
                continue

            # A condition may be given in parentheses after the name...
            m = re.match(r'(.*) \((.*)\)', prop_name)
            if m:
                prop_name = m.group(1)
                prop_conditions = m.group(2)

            # ...or inside the value, separated by an 'at'.
            m = re.match(r'(.*) at (.*)', prop_value)
            if m:
                prop_value = m.group(1)
                prop_conditions = m.group(2)

            new_prop = Result({
                'attribute': prop_name,
                'value': prop_value,
                'source': 'ChemSpider Predicted - ACD/Labs Tab',
                'reliability': 'Unknown',
                'conditions': prop_conditions
            })
            properties.append(new_prop)
            log.msg('CS prop: |%s| |%s| |%s|' %
                    (new_prop['attribute'], new_prop['value'],
                     new_prop['source']), level=log.DEBUG)

        # Experimental Data Tab, Physico-chemical properties in particular.
        scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                 'Properties"]//li/table/tr/td')
        if not scraped_list:
            return properties
        # Format is: a property name cell followed by a list of value cells.
        property_name = scraped_list.pop(0).xpath(
            'span/text()').extract()[0].rstrip()
        for line in scraped_list:
            if line.xpath('span/text()'):
                # A new property name; subsequent cells hold its values.
                property_name = line.xpath('span/text()').extract()[0].rstrip()
            else:
                # [:-1] strips the trailing colon from the property name.
                new_prop = Result({
                    'attribute': property_name[:-1],
                    'value': line.xpath('text()').extract()[0].rstrip(),
                    'source': line.xpath(
                        'strong/text()').extract()[0].rstrip(),
                    'reliability': 'Unknown',
                    'conditions': ''
                })
                properties.append(new_prop)
                log.msg('CS prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)

        return properties

    def parse_synonyms(self, sel):
        """Scrape the list of Names and Identifiers.

        :param sel: a Selector positioned at the structure page
        :return: a list of follow-up Requests for accepted synonyms
        """
        requests = []
        synonyms = []

        # Exact type for this is unknown, but equivalent to Validated by Expert
        for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
            name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Experts"
        for syn in sel.xpath('//p[@class="syn"][strong]'):
            name = syn.xpath('strong/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Users"
        for syn in sel.xpath(
                '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
            name = syn.xpath(
                'span[@class="synonym_confirmed"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'user'))
        # These synonyms are labeled as "Non-validated" and assumed unreliable
        for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
            name = syn.xpath('span[@class=""]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))

        # [TODO] - confirm if English User-Validated synonyms are OK too
        for syn in synonyms:
            if syn['category'] == 'expert' and syn['language'] == 'English':
                log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
                # BUG FIX: the requests generated here were previously
                # discarded, so this method always returned an empty list.
                # Collect them so parse() can return them to the engine.
                # (Assumes get_synonym_requests returns an iterable of
                # Requests - TODO confirm against the spider implementation.)
                requests.extend(
                    self._spider.get_synonym_requests(syn['name']))

        return requests

    def new_synonym(self, sel, name, category):
        """Build a synonym dict from a single synonym HTML tag.

        Also registers the name in ignore_list so the same compound is not
        searched again.

        :param sel: Selector positioned at the synonym's <p class="syn"> tag
        :param name: the synonym text already extracted by the caller
        :param category: 'expert', 'user' or 'nonvalidated'
        :return: a dict with name, category, language and references
        """
        self.ignore_list.append(name)
        language = sel.xpath('span[@class="synonym_language"]/text()')
        if language:
            # The [1:-1] is to remove brackets around the language name
            language = language.extract()[0][1:-1]
        else:
            # If language is not given, English is assumed, [TODO] - confirm
            language = 'English'
        log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
                level=log.DEBUG)
        references = []
        # A synonym can have multiple references, each optionally with link
        for ref in sel.xpath('span[@class="synonym_ref"]'):
            refname = ref.xpath('normalize-space(string())')
            references.append({
                # [1:-1] strips the surrounding brackets from the ref name.
                'name': refname.extract()[0][1:-1],
                'URI': ''
            })
        for ref in sel.xpath('a[@class="synonym_ref"]'):
            references.append({
                'name': ref.xpath('@title').extract()[0],
                'URI': ref.xpath('@href').extract()[0]
            })
        for ref in references:
            log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
                    level=log.DEBUG)
        synonym = {
            'name': name,
            'category': category,
            'language': language,
            'references': references
        }
        return synonym

    @staticmethod
    def parse_extendedinfo(response):
        """Scrape data from the ChemSpider GetExtendedCompoundInfo API.

        :param response: the scrapy Response holding the API's XML reply
        :return: a list of Result items, one per non-empty XML element
        """
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
            result = Result({
                'attribute': name,
                'value': value,  # These values have no unit!
                'source': 'ChemSpider ExtendedCompoundInfo',
                'reliability': 'Unknown',
                'conditions': ''
            })
            # Skip elements with an empty text value.
            if result['value']:
                properties.append(result)
        return properties

    def parse_searchrequest(self, response):
        """Parse the initial response of the ChemSpider Search API.

        Extracts the ChemSpider ID (csid) and issues requests for the
        structure page and the extended-info API.

        :param response: the scrapy Response of the SimpleSearch call
        :return: a list of two Requests, or None when nothing was found
        """
        sel = Selector(response)
        log.msg('chemspider parse_searchrequest', level=log.DEBUG)
        sel.register_namespace('cs', 'http://www.chemspider.com/')
        csids = sel.xpath('.//cs:int/text()').extract()
        if len(csids) == 0:
            log.msg('ChemSpider found nothing', level=log.ERROR)
            return
        elif len(csids) > 1:
            log.msg('ChemSpider found multiple substances, taking first '
                    'element', level=log.DEBUG)
        csid = csids[0]
        # website[:-1] drops the trailing '*' wildcard to get the base URL.
        structure_url = self.website[:-1] + self.structure % csid
        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
        log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
        return [Request(url=structure_url,
                        callback=self.parse),
                Request(url=extendedinfo_url,
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
        """Issue the initial SimpleSearch request for a compound.

        :param compound: the compound name to search for
        :return: a Request for the search API, or None if already handled
        """
        if compound in self.ignore_list:  # [TODO] - add regular expression
            return None
        searchurl = self.website[:-1] + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
        return Request(url=searchurl, callback=self.parse_searchrequest)