A web scraper built to search for specific information on a given compound (and its pseudonyms)
1import re
2
3from scrapy.http import Request
4from scrapy import log
5from scrapy.selector import Selector
6
7from source import Source
8from FourmiCrawler.items import Result
9
10
class WikipediaParser(Source):
    """Wikipedia scraper for chemical properties.

    Parses Wikipedia info boxes (both the class="infobox bordered" and the
    class="infobox" templates) to obtain properties and their values.  It also
    returns requests for other external sources which contain information on
    the parsed subject.
    """

    # Regex pattern of the URLs this source can handle (escaped dots, '.*' tail).
    website = "http://en\\.wikipedia\\.org/wiki/.*"
    __spider = None

    def __init__(self, config=None):
        Source.__init__(self, config)
        # Compounds already scraped.  A per-instance set gives O(1) membership
        # tests and avoids the shared-mutable-state bug of the previous
        # class-level list (every parser instance mutated the same list).
        self.searched_compounds = set()

    def parse(self, response):
        """Entry point for a downloaded Wikipedia page.

        :param response: The incoming search request
        :return: the found properties if the compound is new, or None if it
                 has already been scraped by this parser
        """
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
        sel = Selector(response)
        # The first-heading span holds the canonical compound name of the main page.
        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]
        if compound in self.searched_compounds:
            return None
        self.searched_compounds.add(compound)
        return self.parse_infobox(sel)

    def parse_infobox(self, sel):
        """Scrape data from the infobox(es) on a Wikipedia page.

        Data from two types of infoboxes (class="infobox bordered" and
        class="infobox") is scraped and cleaned, then supplemented with
        Requests for the external sources linked under "Identifiers".

        :param sel: The selector with the html-information of the page to parse
        :return: item_list: a list of Result items (properties with their
                 values, source, etc.) followed by Requests for external sources
        """
        items = []

        # Scrape the chembox (wikipedia template).
        items = self.parse_chembox(sel, items)

        # Scrape the drugbox (wikipedia template).
        items = self.parse_drugbox(sel, items)

        # Remove items with an empty value before cleaning.
        items = [item for item in items if item['value'] != '']
        item_list = self.clean_items(items)

        # Add extra sources to scrape from as requests.
        for identifier in self.get_identifiers(sel):
            # Discard internal wikipedia links.  (The previous implementation
            # still appended a None placeholder for this branch, which would
            # break consumers iterating the returned list.)
            if re.match(r'//en\.wikipedia', identifier):
                log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier,
                        level=log.WARNING)
                continue
            # Fix protocol-relative links starting with '//'; anchor the
            # substitution so only the leading '//' is rewritten, not every
            # occurrence inside the URL.
            if re.match(r'/{2}', identifier):
                identifier = re.sub(r'\A/{2}', 'http://', identifier)
            log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG)
            item_list.append(Request(identifier))

        return item_list

    def parse_chembox(self, sel, items):
        """Scrape data from the chembox infobox on wikipedia.

        The chembox lays out name/value pairs as alternating <td> cells, so
        the cells are paired positionally.

        :param sel: The selector with the html-information of the page to parse
        :param items: the list of items where the results have to be stored in
        :return: items: the list of items with the newly found items appended
        """
        cells = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
            xpath('normalize-space(string())')
        # zip() silently drops a trailing unpaired name cell, where indexing
        # value cells by position would raise IndexError on an odd cell count.
        for name_cell, value_cell in zip(cells[::2], cells[1::2]):
            item = self.newresult(
                attribute=name_cell.extract().encode('utf-8'),
                value=value_cell.extract().encode('utf-8')
            )
            items.append(item)
            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)
        return items

    def parse_drugbox(self, sel, items):
        """Scrape data from the drugbox infobox on wikipedia.

        Each table row with both a <th> (property name) and a <td> (value)
        yields one Result item.

        :param sel: The selector with the html-information of the page to parse
        :param items: the list of items where the results have to be stored in
        :return: items: the list of items with the newly found items appended
        """
        rows = sel.xpath('.//table[@class="infobox"]//tr')
        log.msg('dit: %s' % rows, level=log.DEBUG)
        for row in rows:
            # Evaluate each xpath once per row instead of three times.
            names = row.xpath('./th').xpath('normalize-space(string())')
            values = row.xpath('./td').xpath('normalize-space(string())')
            log.msg('item: %s' % names, level=log.DEBUG)
            if names and values:
                item = self.newresult(
                    attribute=names.extract()[0].encode('utf-8'),
                    value=values.extract()[0].encode('utf-8'),
                )
                items.append(item)
                log.msg(
                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
                    level=log.DEBUG)
        return items

    def new_compound_request(self, compound):
        """Build a Request for the Wikipedia page of the given compound.

        Strips the trailing '.*' and the regex escapes from the ``website``
        pattern to recover the plain base URL.
        """
        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
        """Clean up property values using regexes, separating values from units.

        Almost not in use; only cleans J/K/mol values and boiling/melting
        points.

        :param items: List of properties with their values, source, etc.
        :return: items: List of now cleaned up items
        """
        for item in items:
            value = item['value']
            # Keep only the numerical Kelvin value (printed after the Fahrenheit one).
            m = re.search(r'F;\s(\d+[\.,]?\d*)', value)
            if m:
                item['value'] = m.group(1) + " K"
            # Normalize J/K/mol values to '<number> J/K/mol'.
            m = re.match(r'(\d+[\.,]?\d*)\sJ\sK.+mol', value)
            if m:
                item['value'] = m.group(1) + " J/K/mol"
        return items

    @staticmethod
    def get_identifiers(sel):
        """Find external links, named 'Identifiers', to different sources.

        :param sel: The selector with the html-information of the page to parse
        :return: links: New links which can be used to expand the crawler's search
        """
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
        return links

    def newresult(self, attribute, value):
        """Wrap an attribute/value pair in a Result item tagged with this source.

        Reliability is taken from the parser configuration; conditions are
        unknown for Wikipedia data and left empty.
        """
        return Result(
            {
                'attribute': attribute,
                'value': value,
                'source': 'Wikipedia',
                'reliability': self.cfg['reliability'],
                'conditions': ''
            })