···1515 including sources of the values of properties.
1616 """
17171818- #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
1818+ # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
1919 website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
2020 website_www = 'http://www.ncbi.nlm.nih.gov/*'
2121 website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
···5454 n = re.search(r'cid=(\d+)', response.url)
5555 if n:
5656 cid = n.group(1)
5757- log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
5858- # the seperate html page which contains the properties and their values
5757+ log.msg('cid: %s' % cid, level=log.DEBUG) # getting the right id of the compound with which it can reach
5858+ # the separate html page which contains the properties and their values
59596060- #using this cid to get the right url and scrape it
6161- requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data))
6060+ # using this cid to get the right url and scrape it
6161+ requests.append(
6262+ Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
6263 return requests
63646464- def parse_data(self, response):
6565+ @staticmethod
6666+ def parse_data(response):
6567 """
6668 Parse data found in 'Chemical and Physical properties' part of a substance page.
6769 :param response: The response with the page to parse
···7476 props = sel.xpath('//div')
75777678 for prop in props:
7777- prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
7878- if prop.xpath('a'): # parsing for single value in property
7979+ prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
8080+ if prop.xpath('a'): # parsing for single value in property
7981 prop_source = ''.join(prop.xpath('a/@title').extract())
8082 prop_value = ''.join(prop.xpath('a/text()').extract())
8183 new_prop = Result({
···8991 (new_prop['attribute'], new_prop['value'],
9092 new_prop['source']), level=log.DEBUG)
9193 requests.append(new_prop)
9292- elif prop.xpath('ul'): # parsing for multiple values (list) in property
9494+ elif prop.xpath('ul'): # parsing for multiple values (list) in property
9395 prop_values = prop.xpath('ul//li')
9496 for prop_li in prop_values:
9597 prop_value = ''.join(prop_li.xpath('a/text()').extract())
···102104 'conditions': ''
103105 })
104106 log.msg('PubChem prop: |%s| |%s| |%s|' %
105105- (new_prop['attribute'], new_prop['value'],
106106- new_prop['source']), level=log.DEBUG)
107107+ (new_prop['attribute'], new_prop['value'],
108108+ new_prop['source']), level=log.DEBUG)
107109 requests.append(new_prop)
108110109111 return requests
···116118 case the search request forwarded to the compound page
117119 """
118120119119- #check if pubchem forwarded straight to compound page
121121+ # check if pubchem forwarded straight to compound page
120122 m = re.match(self.website_pubchem, response.url)
121123 if m:
122124 log.msg('PubChem search forwarded to compound page',
+3-3
fourmi.py
···1717 --version Show version.
1818 -v Verbose logging output. (Multiple occurrences increase logging level)
1919 --log=<file> Save log to a file.
2020- -o <file> --output=<file> Output file [default: results.*format*]
2020+ -o <file> --output=<file> Output file [default: <compound>.*format*]
2121 -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
2222 --include=<regex> Include only sources that match these regular expressions split by a comma.
2323 --exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
···5858 """
5959 conf = Configurator()
6060 conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
6161- conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
6161+ conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
6262 setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
6363 source_loader, docopt_arguments["--attributes"].split(','))
6464 if conf.scrapy_settings.getbool("LOG_ENABLED"):
6565 log.start(conf.scrapy_settings.get("LOG_FILE"),
6666- conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
6666+ conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
6767 reactor.run()
68686969
···3344from scrapy.utils.project import get_project_settings
5566+67class Configurator:
78 """
89 A helper class in the fourmi class. This class is used to process the settings as set
···1213 def __init__(self):
1314 self.scrapy_settings = get_project_settings()
14151515- def set_output(self, filename, fileformat):
1616+ def set_output(self, filename, fileformat, compound):
1617 """
1718 This function manipulates the Scrapy output file settings that normally would be set in the settings file.
1819 In the Fourmi project these are command line arguments.
···2021 :param fileformat: The format in which the output will be.
2122 """
22232323- if filename != 'results.*format*':
2424+ if filename != '<compound>.*format*':
2425 self.scrapy_settings.overrides["FEED_URI"] = filename
2526 elif fileformat == "jsonlines":
2626- self.scrapy_settings.overrides["FEED_URI"] = "results.json"
2727+ self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
2728 elif fileformat is not None:
2828- self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
2929+ self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
29303031 if fileformat is not None:
3132 self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat