A web scraper built to search for specific information on a given compound (and its synonyms)
at develop 298 lines 12 kB view raw
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.

class ChemSpider(Source):
    """
    ChemSpider scraper for synonyms and properties.

    This parser will manage searching for chemicals through the
    ChemsSpider API, and parsing the resulting ChemSpider page.
    The token required for the API should be in a configuration file
    somewhere.
    """

    # Regex matched against incoming Response URLs to decide whether this
    # source's parse() handles them (see parse() docstring below).
    website = 'http://www\\.chemspider\\.com/.*'

    # URL templates relative to the site root; the API token is appended to
    # 'search' and 'extendedinfo' in __init__, and '%s' is filled in later
    # with the query string / ChemSpider ID.
    search = 'Search.asmx/SimpleSearch?query=%s&token='
    structure = 'Chemical-Structure.%s.html'
    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='

    def __init__(self, config=None):
        """
        Initialization of ChemSpider scraper.
        :param config: a dictionary of settings for this scraper, must contain
                       'reliability' key; should contain a non-empty 'token'
                       key for the search and MassSpec APIs to be usable
        """
        Source.__init__(self, config)
        # Compound names already emitted as synonyms; new_compound_request
        # skips these to avoid requesting the same compound twice.
        self.ignore_list = []
        if 'token' not in self.cfg or self.cfg['token'] == '':
            log.msg('ChemSpider token not set or empty, search/MassSpec API '
                    'not available', level=log.WARNING)
            self.cfg['token'] = ''
        # Bake the token into the API URL templates once, up front.
        self.search += self.cfg['token']
        self.extendedinfo += self.cfg['token']

    def parse(self, response):
        """
        This function is called when a Response matching the variable
        'website' is available for parsing the Response object.
        :param response: the Scrapy Response object to be parsed
        :return: a list of Result items and Request objects
        """
        sel = Selector(response)
        requests = []
        requests_synonyms = self.parse_synonyms(sel)
        requests.extend(requests_synonyms)
        requests_properties = self.parse_properties(sel)
        requests.extend(requests_properties)

        return requests

    def parse_properties(self, sel):
        """
        This function scrapes the Experimental Data and Predicted ACD/Labs
        tabs.
        :param sel: a Selector object of the whole page
        :return: a list of Result items
        """
        properties = []

        properties.extend(self.parse_acdlabstab(sel))
        properties.extend(self.parse_experimentaldatatab(sel))

        return properties

    def parse_acdlabstab(self, sel):
        """
        This function scrapes the 'Predicted ACD/Labs tab' under Properties.
        :param sel: a Selector object of the whole page
        :return: a list of Result items
        """
        properties = []

        # The table alternates name/value cells, so even indices are names
        # and odd indices are the corresponding values.
        td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
            'normalize-space(string())')
        prop_names = td_list[::2]
        prop_values = td_list[1::2]
        for (prop_name, prop_value) in zip(prop_names, prop_values):
            # [:-1] is to remove the colon at the end, [TODO] - test for colon
            prop_name = prop_name.extract().encode('utf-8')[:-1]
            prop_value = prop_value.extract().encode('utf-8')
            prop_conditions = ''

            # Test for properties without values, with one hardcoded
            # exception: '10-24cm3' is presumably a bare unit shown without a
            # numeric value for Polarizability, so it is skipped as well.
            if (not re.match(r'^\d', prop_value) or
                    (prop_name == 'Polarizability' and
                     prop_value == '10-24cm3')):
                continue

            # A parenthesized suffix on the name, e.g. 'X (cond)', is treated
            # as measurement conditions rather than part of the name.
            m = re.match(r'(.*) \((.*)\)', prop_name)
            if m:
                prop_name = m.group(1)
                prop_conditions = m.group(2)

            # Likewise 'value at conditions' splits into value + conditions.
            m = re.match(r'(.*) at (.*)', prop_value)
            if m:
                prop_value = m.group(1)
                prop_conditions = m.group(2)

            new_prop = self.newresult(
                attribute=prop_name,
                value=prop_value,
                source='ChemSpider Predicted - ACD/Labs Tab',
                conditions=prop_conditions
            )
            properties.append(new_prop)

        return properties

    def parse_experimentaldatatab(self, sel):
        """
        This function scrapes Experimental Data tab, Physico-chemical
        properties in particular.
        :param sel: a Selector object of the whole page
        :return: a list of Result items
        """
        properties = []

        scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                 'Properties"]//li/table/tr/td')
        if not scraped_list:
            return properties
        # Format is: property name followed by a list of values
        property_name = scraped_list.pop(0).xpath(
            'span/text()').extract()[0].rstrip()
        for line in scraped_list:
            # A <span> marks the start of a new property; other rows are
            # values of the most recently seen property name.
            if line.xpath('span/text()'):
                property_name = line.xpath('span/text()').extract()[0].rstrip()
            else:
                new_prop = self.newresult(
                    # [:-1] strips the trailing colon from the property name.
                    attribute=property_name[:-1],
                    value=line.xpath('text()').extract()[0].rstrip(),
                    source=line.xpath('strong/text()').extract()[0].rstrip(),
                )
                properties.append(new_prop)

        return properties

    def parse_synonyms(self, sel):
        """
        This function scrapes the list of Names and Identifiers.

        Note: new search requests are queued via
        self._spider.get_synonym_requests(); the returned list itself stays
        empty.
        :param sel: a Selector object of the whole page
        :return: a list of Requests (currently always empty)
        """
        requests = []
        synonyms = []

        # Exact type for this is unknown, but equivalent to Validated by Expert
        for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
            name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Experts"
        for syn in sel.xpath('//p[@class="syn"][strong]'):
            name = syn.xpath('strong/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Users"
        for syn in sel.xpath(
                '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
            name = syn.xpath(
                'span[@class="synonym_confirmed"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'user'))
        # These synonyms are labeled as "Non-validated" and assumed unreliable
        for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
            name = syn.xpath('span[@class=""]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))

        # [TODO] - confirm if English User-Validated synonyms are OK too
        for syn in synonyms:
            if syn['category'] == 'expert' and syn['language'] == 'English':
                log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
                self._spider.get_synonym_requests(syn['name'])

        return requests

    def new_synonym(self, sel, name, category):
        """
        This function scrapes for a single synonym at a given HTML tag.
        :param sel: a Selector object of the given HTML tag
        :param name: the name of the synonym in the tag
        :param category: the name of the category the synonym is labeled as
        :return: a dictionary containing data on the synonym: 'name',
                 'category', 'language' and a list of 'references'
        """
        # Every synonym seen is added to the ignore list so it is never
        # searched for again by new_compound_request.
        self.ignore_list.append(name)
        language = sel.xpath('span[@class="synonym_language"]/text()')
        if language:
            # The [1:-1] is to remove brackets around the language name
            language = language.extract()[0][1:-1]
        else:
            # If language is not given, English is assumed, [TODO] - confirm
            language = 'English'
        log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
                level=log.DEBUG)
        references = []
        # A synonym can have multiple references, each optionally with link
        for ref in sel.xpath('span[@class="synonym_ref"]'):
            refname = ref.xpath('normalize-space(string())')
            references.append({
                # [1:-1] strips the surrounding brackets from the ref name.
                'name': refname.extract()[0][1:-1],
                'URI': ''
            })
        for ref in sel.xpath('a[@class="synonym_ref"]'):
            references.append({
                'name': ref.xpath('@title').extract()[0],
                'URI': ref.xpath('@href').extract()[0]
            })
        for ref in references:
            log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
                    level=log.DEBUG)
        synonym = {
            'name': name,
            'category': category,
            'language': language,
            'references': references
        }
        return synonym

    def parse_extendedinfo(self, response):
        """
        This function scrapes data from the ChemSpider GetExtendedCompoundInfo
        API, if a token is present in the configuration settings.
        :param response: a Response object to be parsed (XML; element names
                         are the attribute names, element text the values)
        :return: a list of Result items
        """
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
            result = self.newresult(
                attribute=name,
                value=value,  # These values have no unit!
                source='ChemSpider ExtendedCompoundInfo',
            )
            # Skip attributes whose value is empty.
            if result['value']:
                properties.append(result)
        return properties

    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
        """
        This function abstracts from the Result item and provides default
        values.
        :param attribute: the name of the attribute
        :param value: the value of the attribute
        :param conditions: optional conditions regarding the value
        :param source: the name of the source if it is not ChemSpider
        :return: A Result item
        """
        return Result({
            'attribute': attribute,
            'value': value,
            'source': source,
            'reliability': self.cfg['reliability'],
            'conditions': conditions
        })

    def parse_searchrequest(self, response):
        """
        This function parses the initial response of the ChemSpider Search API
        Requires a valid token to function.
        :param response: the Response object to be parsed
        :return: A Request for the information page and a Request for the
                 extendedinfo API call, or None if no substance was found
        """
        sel = Selector(response)
        log.msg('chemspider parse_searchrequest', level=log.DEBUG)
        sel.register_namespace('cs', 'http://www.chemspider.com/')
        csids = sel.xpath('.//cs:int/text()').extract()
        if len(csids) == 0:
            log.msg('ChemSpider found nothing', level=log.ERROR)
            return
        elif len(csids) > 1:
            log.msg('ChemSpider found multiple substances, taking first '
                    'element', level=log.DEBUG)
        csid = csids[0]
        # website[:-2] drops the trailing '.*' of the URL regex; removing the
        # backslashes then yields the plain base URL to prepend.
        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
        log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
        return [Request(url=structure_url,
                        callback=self.parse),
                Request(url=extendedinfo_url,
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
        """
        This function is called when a new synonym is returned to the spider
        to generate new requests.
        :param compound: the name of the compound to search for
        :return: a Request for the search API, or None when the compound is
                 already known or no API token is configured
        """
        if compound in self.ignore_list or self.cfg['token'] == '':
            return None
        # Same base-URL reconstruction as in parse_searchrequest.
        # NOTE(review): 'compound' is not URL-encoded here — TODO confirm
        # whether names with spaces/special characters can reach this point.
        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
        return Request(url=searchurl, callback=self.parse_searchrequest)