A web scraper built to search for specific information on a given compound (and its pseudonyms)
at develop 169 lines 6.9 kB view raw
import re

from scrapy.http import Request
from scrapy import log
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result


class WikipediaParser(Source):
    """ Wikipedia scraper for chemical properties

    This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
    It also returns requests with other external sources which contain information on parsed subject.
    """

    # Regex describing the pages this source is responsible for; also reused
    # (with the regex parts stripped) to build outgoing URLs, see
    # new_compound_request().
    website = "http://en\\.wikipedia\\.org/wiki/.*"
    __spider = None
    # NOTE(review): class-level attribute, so this dedup cache is shared by
    # every instance of the parser; appears intended as a crawl-wide cache --
    # confirm before making it per-instance.
    searched_compounds = []

    def __init__(self, config=None):
        Source.__init__(self, config)

    def parse(self, response):
        """
        Distributes the above described behaviour

        :param response: The incoming search request
        :return: the found properties if the response is for a new compound,
                 or None if the compound was already scraped
        """
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
        sel = Selector(response)
        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use main page
        if compound in self.searched_compounds:
            return None
        items = self.parse_infobox(sel)
        self.searched_compounds.append(compound)
        return items

    def parse_infobox(self, sel):
        """
        Scrape data from infobox on wikipedia.

        Data from two types of infoboxes (class="infobox bordered" and
        class="infobox") is scraped, and requests for the external sources
        listed under 'Identifiers' are generated.
        :param sel: The selector with the html-information of the page to parse
        :return: item_list: a list of properties with their values, source,
                 etc., mixed with Requests for the discovered external sources
        """
        items = []

        # scrape the chembox (wikipedia template)
        items = self.parse_chembox(sel, items)

        # scrape the drugbox (wikipedia template)
        items = self.parse_drugbox(sel, items)

        # remove items with an empty value; a list comprehension (instead of
        # filter()) guarantees a list on both Python 2 and Python 3
        items = [item for item in items if item['value'] != '']
        item_list = self.clean_items(items)

        identifiers = self.get_identifiers(sel)

        # add extra sources to scrape from as requests
        for identifier in identifiers:
            # discard internal wikipedia links; previously a None request was
            # still appended to item_list for these, which is a bug
            if re.match(r'//en\.wikipedia', identifier):
                log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier,
                        level=log.WARNING)
                continue
            # fix protocol-relative links starting with '//'
            if re.match(r'/{2}', identifier):
                identifier = re.sub(r'/{2}', "http://", identifier)
            request = Request(identifier)
            log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
            item_list.append(request)

        return item_list

    def parse_chembox(self, sel, items):
        """
        Scrape data from chembox infobox on wikipedia.

        :param sel: The selector with the html-information of the page to parse
        :param items: the list of items where the result have to be stored in
        :return: items: the list of items with the new found and stored items
        """
        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
            xpath('normalize-space(string())')
        # cells alternate between property names and property values; zip()
        # truncates safely if the table ends on a name without a value
        prop_names = tr_list[::2]
        prop_values = tr_list[1::2]
        for prop_name, prop_value in zip(prop_names, prop_values):
            item = self.newresult(
                attribute=prop_name.extract().encode('utf-8'),
                value=prop_value.extract().encode('utf-8')
            )
            items.append(item)
            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)
        return items

    def parse_drugbox(self, sel, items):
        """
        Scrape data from drugbox infobox on wikipedia.

        :param sel: The selector with the html-information of the page to parse
        :param items: the list of items where the result have to be stored in
        :return: items: the list of items with the new found and stored items
        """
        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
        for tablerow in tr_list2:
            # hoist the header/data lookups so each xpath runs once per row
            header = tablerow.xpath('./th').xpath('normalize-space(string())')
            data = tablerow.xpath('./td').xpath('normalize-space(string())')
            log.msg('item: %s' % header, level=log.DEBUG)
            if header and data:
                item = self.newresult(
                    attribute=header.extract()[0].encode('utf-8'),
                    value=data.extract()[0].encode('utf-8'),
                )
                items.append(item)
                log.msg(
                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)
        return items

    def new_compound_request(self, compound):
        # strip the regex parts ('.*' suffix and escape backslashes) from
        # self.website to recover the plain base URL
        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
        """
        Clean up properties using regex, makes it possible to split the values from the units

        Almost not in use, only cleans J/K/mol values and boiling/melting points.

        :param items: List of properties with their values, source, etc..
        :return: items: List of now cleaned up items
        """
        for item in items:
            value = item['value']
            m = re.search(r'F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
            if m:
                item['value'] = m.group(1) + " K"
            m = re.match(r'(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # clean up J/K/mol values
            if m:
                item['value'] = m.group(1) + " J/K/mol"
        return items

    @staticmethod
    def get_identifiers(sel):
        """
        Find external links, named 'Identifiers' to different sources.

        :param sel: The selector with the html-information of the page to parse
        :return: links: New links which can be used to expand the crawlers search
        """
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
        return links

    def newresult(self, attribute, value):
        # Wrap a scraped attribute/value pair in a Result item, tagging it
        # with this source's name and configured reliability.
        return Result(
            {
                'attribute': attribute,
                'value': value,
                'source': 'Wikipedia',
                'reliability': self.cfg['reliability'],
                'conditions': ''
            })