A web scraper built to search for specific information on a given compound (and its synonyms).
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class ChemSpider(Source):
    """ChemSpider scraper for synonyms and properties.

    This parser will manage searching for chemicals through the
    ChemSpider API, and parsing the resulting ChemSpider page.
    The token required for the API should be in a configuration file
    somewhere.
    """

    def __init__(self):
        Source.__init__(self)

    website = 'http://www.chemspider.com/*'

    # [TODO] - Save and access token of specific user.
    search = ('Search.asmx/SimpleSearch?query=%s&token='
              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
    structure = 'Chemical-Structure.%s.html'
    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')

    # Compound names already requested, used to suppress duplicate searches.
    ignore_list = []

    @staticmethod
    def _extract_first(selector, default=''):
        """Return the first extracted string of `selector`, or `default`.

        Guards against the IndexError that a bare '.extract()[0]' raises
        when the selector matched nothing.
        """
        extracted = selector.extract()
        return extracted[0] if extracted else default

    def parse(self, response):
        """Parse a ChemSpider structure page.

        Returns the combined follow-up Requests/items produced by the
        synonym scraper and the property scraper.
        """
        sel = Selector(response)
        requests = []
        requests.extend(self.parse_synonyms(sel))
        requests.extend(self.parse_properties(sel))
        return requests

    @staticmethod
    def parse_properties(sel):
        """Scrape the Experimental Data and Predicted ACD/Labs tabs.

        Returns a list of Result items, one per scraped property.
        """
        properties = []

        # Predicted - ACD/Labs tab: the table holds alternating
        # name / value cells.
        td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
            'normalize-space(string())')
        prop_names = td_list[::2]
        prop_values = td_list[1::2]
        for (prop_name, prop_value) in zip(prop_names, prop_values):
            # Strip the trailing colon from the name only when present
            # (the old blind '[:-1]' ate the last character regardless).
            prop_name = prop_name.extract().encode('utf-8').rstrip(':')
            prop_value = prop_value.extract().encode('utf-8')
            prop_conditions = ''

            # Skip properties without a (numeric) value, plus one
            # hardcoded bogus Polarizability value ChemSpider emits.
            if (not re.match(r'^\d', prop_value) or
                    (prop_name == 'Polarizability' and
                     prop_value == '10-24cm3')):
                continue

            # Conditions may be given in parentheses after the name...
            m = re.match(r'(.*) \((.*)\)', prop_name)
            if m:
                prop_name = m.group(1)
                prop_conditions = m.group(2)

            # ...or inside the value, separated by an 'at'.
            m = re.match(r'(.*) at (.*)', prop_value)
            if m:
                prop_value = m.group(1)
                prop_conditions = m.group(2)

            new_prop = Result({
                'attribute': prop_name,
                'value': prop_value,
                'source': 'ChemSpider Predicted - ACD/Labs Tab',
                'reliability': 'Unknown',
                'conditions': prop_conditions
            })
            properties.append(new_prop)
            log.msg('CS prop: |%s| |%s| |%s|' %
                    (new_prop['attribute'], new_prop['value'],
                     new_prop['source']), level=log.DEBUG)

        # Experimental Data Tab, Physico-chemical properties in particular.
        scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                 'Properties"]//li/table/tr/td')
        if not scraped_list:
            return properties
        # Format is: a property-name cell followed by a list of value cells.
        property_name = ChemSpider._extract_first(
            scraped_list.pop(0).xpath('span/text()')).rstrip()
        for line in scraped_list:
            if line.xpath('span/text()'):
                # A new property-name cell: subsequent values belong to it.
                property_name = ChemSpider._extract_first(
                    line.xpath('span/text()')).rstrip()
            else:
                new_prop = Result({
                    'attribute': property_name.rstrip(':'),
                    'value': ChemSpider._extract_first(
                        line.xpath('text()')).rstrip(),
                    'source': ChemSpider._extract_first(
                        line.xpath('strong/text()')).rstrip(),
                    'reliability': 'Unknown',
                    'conditions': ''
                })
                properties.append(new_prop)
                log.msg('CS prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)

        return properties

    def parse_synonyms(self, sel):
        """Scrape the list of Names and Identifiers.

        Returns the follow-up Requests generated for reliable
        (expert-validated, English) synonyms.
        """
        requests = []
        synonyms = []

        # Exact type for this is unknown, but equivalent to Validated by Expert
        for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
            name = self._extract_first(
                syn.xpath('span[@class="synonym_cn"]/text()'))
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Experts"
        for syn in sel.xpath('//p[@class="syn"][strong]'):
            name = self._extract_first(syn.xpath('strong/text()'))
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Users"
        for syn in sel.xpath(
                '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
            name = self._extract_first(
                syn.xpath('span[@class="synonym_confirmed"]/text()'))
            synonyms.append(self.new_synonym(syn, name, 'user'))
        # These synonyms are labeled as "Non-validated" and assumed unreliable
        for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
            name = self._extract_first(syn.xpath('span[@class=""]/text()'))
            synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))

        # [TODO] - confirm if English User-Validated synonyms are OK too
        for syn in synonyms:
            if syn['category'] == 'expert' and syn['language'] == 'English':
                log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
                # Collect the generated requests; previously the return
                # value was discarded and this method always returned [].
                new_requests = self._spider.get_synonym_requests(syn['name'])
                if new_requests:
                    requests.extend(new_requests)

        return requests

    def new_synonym(self, sel, name, category):
        """Build a synonym dict from a single synonym HTML tag.

        Also registers `name` on the ignore list so it is not searched
        for again.
        """
        self.ignore_list.append(name)
        language = sel.xpath('span[@class="synonym_language"]/text()')
        if language:
            # The [1:-1] is to remove brackets around the language name
            language = language.extract()[0][1:-1]
        else:
            # If language is not given, English is assumed, [TODO] - confirm
            language = 'English'
        log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
                level=log.DEBUG)
        references = []
        # A synonym can have multiple references, each optionally with link
        for ref in sel.xpath('span[@class="synonym_ref"]'):
            refname = ref.xpath('normalize-space(string())')
            references.append({
                # [1:-1] strips the surrounding brackets from the ref name.
                'name': self._extract_first(refname)[1:-1],
                'URI': ''
            })
        for ref in sel.xpath('a[@class="synonym_ref"]'):
            references.append({
                'name': self._extract_first(ref.xpath('@title')),
                'URI': self._extract_first(ref.xpath('@href'))
            })
        for ref in references:
            log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
                    level=log.DEBUG)
        synonym = {
            'name': name,
            'category': category,
            'language': language,
            'references': references
        }
        return synonym

    @staticmethod
    def parse_extendedinfo(response):
        """Scrape data from the ChemSpider GetExtendedCompoundInfo API.

        Each child XML element becomes one Result; empty values are
        dropped.
        """
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
            result = Result({
                'attribute': name,
                'value': value,  # These values have no unit!
                'source': 'ChemSpider ExtendedCompoundInfo',
                'reliability': 'Unknown',
                'conditions': ''
            })
            if result['value']:
                properties.append(result)
        return properties

    def parse_searchrequest(self, response):
        """Parse the initial response of the ChemSpider Search API.

        Returns Requests for the structure page and the extended-info
        API of the first matching ChemSpider ID, or None when nothing
        was found.
        """
        sel = Selector(response)
        log.msg('chemspider parse_searchrequest', level=log.DEBUG)
        sel.register_namespace('cs', 'http://www.chemspider.com/')
        csids = sel.xpath('.//cs:int/text()').extract()
        if len(csids) == 0:
            log.msg('ChemSpider found nothing', level=log.ERROR)
            return
        elif len(csids) > 1:
            log.msg('ChemSpider found multiple substances, taking first '
                    'element', level=log.DEBUG)
        csid = csids[0]
        # self.website ends in '*'; [:-1] drops the wildcard.
        structure_url = self.website[:-1] + self.structure % csid
        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
        log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
        return [Request(url=structure_url,
                        callback=self.parse),
                Request(url=extendedinfo_url,
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
        """Return a search Request for `compound`, or None if ignored."""
        if compound in self.ignore_list:  # [TODO] - add regular expression
            return None
        searchurl = self.website[:-1] + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
        return Request(url=searchurl, callback=self.parse_searchrequest)