···2121# Crawl responsibly by identifying yourself (and your website) on the
2222# user-agent
23232424-# [todo] - Check for repercussions on spoofing the user agent
2525-2626-# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
2727-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
2424+USER_AGENT = 'Fourmi'
···1818 This plugin manages searching for a chemical on the NIST website
1919 and parsing the resulting page if the chemical exists on NIST.
2020 """
2121- website = "http://webbook.nist.gov/*"
2121+ website = "http://webbook\\.nist\\.gov/.*"
22222323 search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
2424···164164 extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
165165 '/a/@href').extract()
166166 if extra_data_url:
167167- request = Request(url=self.website[:-1] + extra_data_url[0],
167167+ request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
168168 callback=self.parse_individual_datapoints)
169169 results.append(request)
170170 continue
···329329 """
330330 if compound not in self.ignore_list:
331331 self.ignore_list.update(compound)
332332- return Request(url=self.website[:-1] + self.search % compound,
332332+ return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
333333 callback=self.parse)
+45-8
FourmiCrawler/sources/PubChem.py
···11+import re
22+13from scrapy.http import Request
24from scrapy import log
33-from source import Source
45from scrapy.selector import Selector
66+77+from source import Source
58from FourmiCrawler.items import Result
66-import re
79810911class PubChem(Source):
···1416 """
15171618 # PubChem keeps a compound's name, its properties and their values on different html pages, so different URLs are used
1717- website = 'https://*.ncbi.nlm.nih.gov/*'
1818- website_www = 'https://www.ncbi.nlm.nih.gov/*'
1919- website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
1919+ website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
2020+ website_www = 'http://www.ncbi.nlm.nih.gov/*'
2121+ website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
2022 search = 'pccompound?term=%s'
2123 data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
2224···4951 self._spider.get_synonym_requests(synonym)
5052 log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
51535252- n = re.search(r'cid=(\d+)',response.url)
5454+ n = re.search(r'cid=(\d+)', response.url)
5355 if n:
5456 cid = n.group(1)
5557 log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
5658 # the separate html page which contains the properties and their values
57595860 #using this cid to get the right url and scrape it
5959- requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
6161+ requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data))
6062 return requests
61636264 def parse_data(self, response):
···106108107109 return requests
108110111111+ def parse_searchrequest(self, response):
112112+ """
113113+ This function parses the response to the new_compound_request Request
114114+ :param response: the Response object to be parsed
115115+ :return: A Request for the compound page or what self.parse returns in
116116+ case the search request was forwarded to the compound page
117117+ """
118118+119119+ #check if pubchem forwarded straight to compound page
120120+ m = re.match(self.website_pubchem, response.url)
121121+ if m:
122122+ log.msg('PubChem search forwarded to compound page',
123123+ level=log.DEBUG)
124124+ return self.parse(response)
125125+126126+ sel = Selector(response)
127127+128128+ results = sel.xpath('//div[@class="rsltcont"]')
129129+ if results:
130130+ url = results[0].xpath('div/p/a[1]/@href')
131131+ else:
132132+ log.msg('PubChem search found nothing or xpath failed',
133133+ level=log.DEBUG)
134134+ return None
135135+136136+ if url:
137137+ url = 'http:' + ''.join(url[0].extract())
138138+ log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
139139+ else:
140140+ log.msg('PubChem search found results, but no url in first result',
141141+ level=log.DEBUG)
142142+ return None
143143+144144+ return Request(url=url, callback=self.parse)
109145110146 def new_compound_request(self, compound):
111111- return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
147147+ return Request(url=self.website_www[:-1] + self.search % compound,
148148+ callback=self.parse_searchrequest)
+2-2
FourmiCrawler/sources/WikipediaParser.py
···1515 It also returns requests with other external sources which contain information on parsed subject.
1616 """
17171818- website = "http://en.wikipedia.org/wiki/*"
1818+ website = "http://en\\.wikipedia\\.org/wiki/.*"
1919 __spider = None
2020 searched_compounds = []
2121···123123 return items
124124125125 def new_compound_request(self, compound):
126126- return Request(url=self.website[:-1] + compound, callback=self.parse)
126126+ return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
127127128128 @staticmethod
129129 def clean_items(items):
+2-2
FourmiCrawler/sources/source.py
···334455class Source:
66- website = "http://something/*" # Regex of URI's the source is able to parse
66+ website = "http://something/.*" # Regex of URI's the source is able to parse
77 _spider = None
8899 def __init__(self, config=None):
···3030 :param compound: A compound name.
3131 :return: A new Scrapy Request
3232 """
3333- # return Request(url=self.website[:-1] + compound, callback=self.parse)
3333+ # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
3434 pass
35353636 def set_spider(self, spider):