"""A web scraper built to search for specific information on a given compound (and its pseudonyms)."""
1import re
2
3from scrapy import log
4from scrapy.http import Request
5from scrapy.selector import Selector
6
7from source import Source
8from FourmiCrawler.items import Result
9
10
11# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
12
class ChemSpider(Source):
    """
    ChemSpider scraper for synonyms and properties.

    This parser manages searching for chemicals through the ChemSpider
    API and parsing the resulting ChemSpider page. The token required
    for the API should be in a configuration file somewhere.
    """

    # Regex matching any URL on the ChemSpider website.
    website = 'http://www\\.chemspider\\.com/.*'

    # URL templates relative to the site root; the API token is appended
    # to the search and extendedinfo templates in __init__.
    search = 'Search.asmx/SimpleSearch?query=%s&token='
    structure = 'Chemical-Structure.%s.html'
    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
27
28 def __init__(self, config=None):
29 """
30 Initialization of ChemSpider scraper
31 :param config: a dictionary of settings for this scraper, must contain
32 'reliability' key
33 """
34 Source.__init__(self, config)
35 self.ignore_list = []
36 if 'token' not in self.cfg or self.cfg['token'] == '':
37 log.msg('ChemSpider token not set or empty, search/MassSpec API '
38 'not available', level=log.WARNING)
39 self.cfg['token'] = ''
40 self.search += self.cfg['token']
41 self.extendedinfo += self.cfg['token']
42
43 def parse(self, response):
44 """
45 This function is called when a Response matching the variable
46 'website' is available for parsing the Response object.
47 :param response: the Scrapy Response object to be parsed
48 :return: a list of Result items and Request objects
49 """
50 sel = Selector(response)
51 requests = []
52 requests_synonyms = self.parse_synonyms(sel)
53 requests.extend(requests_synonyms)
54 requests_properties = self.parse_properties(sel)
55 requests.extend(requests_properties)
56
57 return requests
58
59 def parse_properties(self, sel):
60 """
61 This function scrapes the Experimental Data and Predicted ACD/Labs tabs
62 :param sel: a Selector object of the whole page
63 :return: a list of Result items
64 """
65 properties = []
66
67 properties.extend(self.parse_acdlabstab(sel))
68 properties.extend(self.parse_experimentaldatatab(sel))
69
70 return properties
71
72 def parse_acdlabstab(self, sel):
73 """
74 This function scrapes the 'Predicted ACD/Labs tab' under Properties
75 :param sel: a Selector object of the whole page
76 :return: a list of Request objects
77 """
78 properties = []
79
80 td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
81 'normalize-space(string())')
82 prop_names = td_list[::2]
83 prop_values = td_list[1::2]
84 for (prop_name, prop_value) in zip(prop_names, prop_values):
85 # [:-1] is to remove the colon at the end, [TODO] - test for colon
86 prop_name = prop_name.extract().encode('utf-8')[:-1]
87 prop_value = prop_value.extract().encode('utf-8')
88 prop_conditions = ''
89
90 # Test for properties without values, with one hardcoded exception
91 if (not re.match(r'^\d', prop_value) or
92 (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
93 continue
94
95 m = re.match(r'(.*) \((.*)\)', prop_name)
96 if m:
97 prop_name = m.group(1)
98 prop_conditions = m.group(2)
99
100 m = re.match(r'(.*) at (.*)', prop_value)
101 if m:
102 prop_value = m.group(1)
103 prop_conditions = m.group(2)
104
105 new_prop = self.newresult(
106 attribute=prop_name,
107 value=prop_value,
108 source='ChemSpider Predicted - ACD/Labs Tab',
109 conditions=prop_conditions
110 )
111 properties.append(new_prop)
112
113 return properties
114
115 def parse_experimentaldatatab(self, sel):
116 """
117 This function scrapes Experimental Data tab, Physico-chemical
118 properties in particular.
119 :param sel: a Selector object of the whole page
120 :return: a list of Result items
121 """
122 properties = []
123
124 scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
125 'Properties"]//li/table/tr/td')
126 if not scraped_list:
127 return properties
128 # Format is: property name followed by a list of values
129 property_name = scraped_list.pop(0).xpath(
130 'span/text()').extract()[0].rstrip()
131 for line in scraped_list:
132 if line.xpath('span/text()'):
133 property_name = line.xpath('span/text()').extract()[0].rstrip()
134 else:
135 new_prop = self.newresult(
136 attribute=property_name[:-1],
137 value=line.xpath('text()').extract()[0].rstrip(),
138 source=line.xpath('strong/text()').extract()[0].rstrip(),
139 )
140 properties.append(new_prop)
141
142 return properties
143
144 def parse_synonyms(self, sel):
145 """
146 This function scrapes the list of Names and Identifiers
147 :param sel: a Selector object of the whole page
148 :return: a list of Requests
149 """
150 requests = []
151 synonyms = []
152
153 # Exact type for this is unknown, but equivalent to Validated by Expert
154 for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
155 name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
156 synonyms.append(self.new_synonym(syn, name, 'expert'))
157 # These synonyms are labeled by ChemSpider as "Validated by Experts"
158 for syn in sel.xpath('//p[@class="syn"][strong]'):
159 name = syn.xpath('strong/text()').extract()[0]
160 synonyms.append(self.new_synonym(syn, name, 'expert'))
161 # These synonyms are labeled by ChemSpider as "Validated by Users"
162 for syn in sel.xpath(
163 '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
164 name = syn.xpath(
165 'span[@class="synonym_confirmed"]/text()').extract()[0]
166 synonyms.append(self.new_synonym(syn, name, 'user'))
167 # These syonyms are labeled as "Non-validated" and assumed unreliable
168 for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
169 name = syn.xpath('span[@class=""]/text()').extract()[0]
170 synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))
171
172 # [TODO] - confirm if English User-Validated synonyms are OK too
173 for syn in synonyms:
174 if syn['category'] == 'expert' and syn['language'] == 'English':
175 log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
176 self._spider.get_synonym_requests(syn['name'])
177
178 return requests
179
180 def new_synonym(self, sel, name, category):
181 """
182 This function scrapes for a single synonym at a given HTML tag
183 :param sel: a Selector object of the given HTML tag
184 :param name: the name of the synonym in the tag
185 :param category: the name of the category the synonym is labeled as
186 :return: a dictionary containing data on the synonym
187 """
188 self.ignore_list.append(name)
189 language = sel.xpath('span[@class="synonym_language"]/text()')
190 if language:
191 # The [1:-1] is to remove brackets around the language name
192 language = language.extract()[0][1:-1]
193 else:
194 # If language is not given, English is assumed, [TODO] - confirm
195 language = 'English'
196 log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
197 level=log.DEBUG)
198 references = []
199 # A synonym can have multiple references, each optionally with link
200 for ref in sel.xpath('span[@class="synonym_ref"]'):
201 refname = ref.xpath('normalize-space(string())')
202 references.append({
203 'name': refname.extract()[0][1:-1],
204 'URI': ''
205 })
206 for ref in sel.xpath('a[@class="synonym_ref"]'):
207 references.append({
208 'name': ref.xpath('@title').extract()[0],
209 'URI': ref.xpath('@href').extract()[0]
210 })
211 for ref in references:
212 log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
213 level=log.DEBUG)
214 synonym = {
215 'name': name,
216 'category': category,
217 'language': language,
218 'references': references
219 }
220 return synonym
221
222 def parse_extendedinfo(self, response):
223 """
224 This function scrapes data from the ChemSpider GetExtendedCompoundInfo
225 API, if a token is present in the configuration settings
226 :param response: a Response object to be parsed
227 :return: a list of Result items
228 """
229 sel = Selector(response)
230 properties = []
231 names = sel.xpath('*').xpath('name()').extract()
232 values = sel.xpath('*').xpath('text()').extract()
233 for (name, value) in zip(names, values):
234 result = self.newresult(
235 attribute=name,
236 value=value, # These values have no unit!
237 source='ChemSpider ExtendedCompoundInfo',
238 )
239 if result['value']:
240 properties.append(result)
241 return properties
242
243 def newresult(self, attribute, value, conditions='', source='ChemSpider'):
244 """
245 This function abstracts from the Result item and provides default
246 values.
247 :param attribute: the name of the attribute
248 :param value: the value of the attribute
249 :param conditions: optional conditions regarding the value
250 :param source: the name of the source if it is not ChemSpider
251 :return: A Result item
252 """
253 return Result({
254 'attribute': attribute,
255 'value': value,
256 'source': source,
257 'reliability': self.cfg['reliability'],
258 'conditions': conditions
259 })
260
261 def parse_searchrequest(self, response):
262 """
263 This function parses the initial response of the ChemSpider Search API
264 Requires a valid token to function.
265 :param response: the Response object to be parsed
266 :return: A Request for the information page and a Request for the
267 extendedinfo API call
268 """
269 sel = Selector(response)
270 log.msg('chemspider parse_searchrequest', level=log.DEBUG)
271 sel.register_namespace('cs', 'http://www.chemspider.com/')
272 csids = sel.xpath('.//cs:int/text()').extract()
273 if len(csids) == 0:
274 log.msg('ChemSpider found nothing', level=log.ERROR)
275 return
276 elif len(csids) > 1:
277 log.msg('ChemSpider found multiple substances, taking first '
278 'element', level=log.DEBUG)
279 csid = csids[0]
280 structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
281 extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
282 log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
283 return [Request(url=structure_url,
284 callback=self.parse),
285 Request(url=extendedinfo_url,
286 callback=self.parse_extendedinfo)]
287
288 def new_compound_request(self, compound):
289 """
290 This function is called when a new synonym is returned to the spider
291 to generate new requests
292 :param compound: the name of the compound to search for
293 """
294 if compound in self.ignore_list or self.cfg['token'] == '':
295 return None
296 searchurl = self.website[:-2].replace("\\", "") + self.search % compound
297 log.msg('chemspider compound', level=log.DEBUG)
298 return Request(url=searchurl, callback=self.parse_searchrequest)