A web scraper built to search for specific information on a given compound (and its pseudonyms)

Merge branch 'release/v0.5.0'

+276 -168
+3
.gitignore
··· 4 4 #Python Specific ignores 5 5 *.pyc 6 6 7 + #may contain authentication information 8 + sources.cfg 9 + 7 10 #THINGS WE WOULD NEVER EVER WANT! 8 11 #ignore thumbnails created by windows 9 12 Thumbs.db
+1 -1
.travis.yml
··· 10 10 11 11 # command to run tests, e.g. python setup.py test 12 12 script: 13 - - nosetests --with-coverage --cover-package=FourmiCrawler tests 13 + - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests 14 14 15 15 notifications: 16 16 slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
+42 -37
FourmiCrawler/sources/ChemSpider.py
··· 9 9 10 10 11 11 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. 12 - 12 + # [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not 13 13 14 14 class ChemSpider(Source): 15 15 """ChemSpider scraper for synonyms and properties ··· 19 19 The token required for the API should be in a configuration file 20 20 somewhere. 21 21 """ 22 - 23 - def __init__(self): 24 - Source.__init__(self) 25 22 26 23 website = 'http://www.chemspider.com/*' 27 24 28 - # [TODO] - Save and access token of specific user. 29 - search = ('Search.asmx/SimpleSearch?query=%s&token=' 30 - '052bfd06-5ce4-43d6-bf12-89eabefd2338') 25 + search = 'Search.asmx/SimpleSearch?query=%s&token=' 31 26 structure = 'Chemical-Structure.%s.html' 32 - extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' 33 - '052bfd06-5ce4-43d6-bf12-89eabefd2338') 27 + extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' 34 28 35 - ignore_list = [] 29 + def __init__(self, config={}): 30 + Source.__init__(self, config) 31 + self.cfg = config 32 + self.ignore_list = [] 33 + if 'token' not in self.cfg or self.cfg['token'] == '': 34 + log.msg('ChemSpider token not set or empty, search/MassSpec API ' 35 + 'not available', level=log.WARNING) 36 + self.cfg['token'] = '' 37 + self.search += self.cfg['token'] 38 + self.extendedinfo += self.cfg['token'] 39 + 36 40 37 41 def parse(self, response): 38 42 sel = Selector(response) ··· 44 48 45 49 return requests 46 50 47 - @staticmethod 48 - def parse_properties(sel): 51 + def parse_properties(self, sel): 49 52 """scrape Experimental Data and Predicted ACD/Labs tabs""" 50 53 properties = [] 51 54 ··· 76 79 prop_value = m.group(1) 77 80 prop_conditions = m.group(2) 78 81 79 - new_prop = Result({ 80 - 'attribute': prop_name, 81 - 'value': prop_value, 82 - 'source': 'ChemSpider Predicted - ACD/Labs Tab', 83 - 'reliability': 'Unknown', 84 - 'conditions': 
prop_conditions 85 - }) 82 + new_prop = self.newresult( 83 + attribute=prop_name, 84 + value=prop_value, 85 + source='ChemSpider Predicted - ACD/Labs Tab', 86 + conditions=prop_conditions 87 + ) 86 88 properties.append(new_prop) 87 89 log.msg('CS prop: |%s| |%s| |%s|' % 88 90 (new_prop['attribute'], new_prop['value'], new_prop['source']), ··· 100 102 if line.xpath('span/text()'): 101 103 property_name = line.xpath('span/text()').extract()[0].rstrip() 102 104 else: 103 - new_prop = Result({ 104 - 'attribute': property_name[:-1], 105 - 'value': line.xpath('text()').extract()[0].rstrip(), 106 - 'source': line.xpath( 107 - 'strong/text()').extract()[0].rstrip(), 108 - 'reliability': 'Unknown', 109 - 'conditions': '' 110 - }) 105 + new_prop = self.newresult( 106 + attribute=property_name[:-1], 107 + value=line.xpath('text()').extract()[0].rstrip(), 108 + source=line.xpath('strong/text()').extract()[0].rstrip(), 109 + ) 111 110 properties.append(new_prop) 112 111 log.msg('CS prop: |%s| |%s| |%s|' % 113 112 (new_prop['attribute'], new_prop['value'], ··· 183 182 } 184 183 return synonym 185 184 186 - @staticmethod 187 - def parse_extendedinfo(response): 185 + def parse_extendedinfo(self, response): 188 186 """Scrape data from the ChemSpider GetExtendedCompoundInfo API""" 189 187 sel = Selector(response) 190 188 properties = [] 191 189 names = sel.xpath('*').xpath('name()').extract() 192 190 values = sel.xpath('*').xpath('text()').extract() 193 191 for (name, value) in zip(names, values): 194 - result = Result({ 195 - 'attribute': name, 196 - 'value': value, # These values have no unit! 197 - 'source': 'ChemSpider ExtendedCompoundInfo', 198 - 'reliability': 'Unknown', 199 - 'conditions': '' 200 - }) 192 + result = self.newresult( 193 + attribute=name, 194 + value=value, # These values have no unit! 
195 + source='ChemSpider ExtendedCompoundInfo', 196 + ) 201 197 if result['value']: 202 198 properties.append(result) 203 199 return properties 200 + 201 + def newresult(self, attribute, value, conditions='', source='ChemSpider'): 202 + return Result({ 203 + 'attribute': attribute, 204 + 'value': value, 205 + 'source': source, 206 + 'reliability': self.cfg['reliability'], 207 + 'conditions': conditions 208 + }) 204 209 205 210 def parse_searchrequest(self, response): 206 211 """Parse the initial response of the ChemSpider Search API """ ··· 224 229 callback=self.parse_extendedinfo)] 225 230 226 231 def new_compound_request(self, compound): 227 - if compound in self.ignore_list: # [TODO] - add regular expression 232 + if compound in self.ignore_list or self.cfg['token'] == '': 228 233 return None 229 234 searchurl = self.website[:-1] + self.search % compound 230 235 log.msg('chemspider compound', level=log.DEBUG)
+47 -53
FourmiCrawler/sources/NIST.py
··· 22 22 23 23 search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' 24 24 25 - ignore_list = set() 25 + cfg = {} 26 26 27 - def __init__(self): 28 - Source.__init__(self) 27 + def __init__(self, config={}): 28 + Source.__init__(self, config) 29 + self.ignore_list = set() 30 + self.cfg = config 29 31 30 32 def parse(self, response): 31 33 sel = Selector(response) ··· 114 116 115 117 requests = [] 116 118 for key, value in data.iteritems(): 117 - result = Result({ 118 - 'attribute': key, 119 - 'value': value, 120 - 'source': 'NIST', 121 - 'reliability': 'Unknown', 122 - 'conditions': '' 123 - }) 119 + result = self.newresult( 120 + attribute=key, 121 + value=value 122 + ) 124 123 requests.append(result) 125 124 126 125 return requests ··· 150 149 name = m.group(1) 151 150 condition = m.group(2) 152 151 153 - result = Result({ 154 - 'attribute': name, 155 - 'value': data[1] + ' ' + data[2], 156 - 'source': 'NIST', 157 - 'reliability': 'Unknown', 158 - 'conditions': condition 159 - }) 152 + result = self.newresult( 153 + attribute=name, 154 + value=data[1] + ' ' + data[2], 155 + conditions=condition 156 + ) 160 157 log.msg('NIST: |%s|' % data, level=log.DEBUG) 161 158 results.append(result) 162 159 return results 163 160 164 - @staticmethod 165 - def parse_transition_data(table, summary): 161 + def parse_transition_data(self, table, summary): 166 162 """Parses the table containing properties regarding phase changes""" 167 163 results = [] 168 164 ··· 174 170 175 171 for tr in table.xpath('tr[td]'): 176 172 tds = tr.xpath('td/text()').extract() 177 - result = Result({ 178 - 'attribute': summary, 179 - 'value': tds[0] + ' ' + unit, 180 - 'source': 'NIST', 181 - 'reliability': 'Unknown', 182 - 'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3]) 183 - }) 173 + result = self.newresult( 174 + attribute=summary, 175 + value=tds[0] + ' ' + unit, 176 + conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3]) 177 + ) 184 178 results.append(result) 185 179 186 180 return 
results 187 181 188 - @staticmethod 189 - def parse_generic_data(table, summary): 182 + def parse_generic_data(self, table, summary): 190 183 """Parses the common tables of 4 and 5 rows. Assumes they are of the 191 184 form: 192 185 Symbol (unit)|Temperature (K)|Method|Reference|Comment ··· 202 195 203 196 for tr in table.xpath('tr[td]'): 204 197 tds = tr.xpath('td/text()').extract() 205 - result = Result({ 206 - 'attribute': summary, 207 - 'value': tds[0] + ' ' + unit, 208 - 'source': 'NIST', 209 - 'reliability': 'Unknown', 210 - 'conditions': '%s K' % tds[1] 211 - }) 198 + result = self.newresult( 199 + attribute=summary, 200 + value=tds[0] + ' ' + unit, 201 + conditions='%s K' % tds[1] 202 + ) 212 203 results.append(result) 213 204 return results 214 205 215 - @staticmethod 216 - def parse_antoine_data(table, summary): 206 + def parse_antoine_data(self, table, summary): 217 207 """Parse table containing parameters for the Antione equation""" 218 208 results = [] 219 209 220 210 for tr in table.xpath('tr[td]'): 221 211 tds = tr.xpath('td/text()').extract() 222 - result = Result({ 223 - 'attribute': summary, 224 - 'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]), 225 - 'source': 'NIST', 226 - 'reliability': 'Unknown', 227 - 'conditions': '%s K' % tds[0] 228 - }) 212 + result = self.newresult( 213 + attribute=summary, 214 + value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]), 215 + conditions='%s K' % tds[0] 216 + ) 229 217 results.append(result) 230 218 231 219 return results 232 220 233 - @staticmethod 234 - def parse_individual_datapoints(response): 221 + def parse_individual_datapoints(self, response): 235 222 """Parses the page linked from aggregate data""" 236 223 sel = Selector(response) 237 224 table = sel.xpath('//table[@class="data"]')[0] ··· 258 245 if m: 259 246 uncertainty = '+- %s ' % m.group(1) 260 247 # [TODO]: get the plusminus sign working in here 261 - result = Result({ 262 - 'attribute': name, 263 - 'value': '%s %s%s' % (tds[0], 
uncertainty, unit), 264 - 'source': 'NIST', 265 - 'reliability': 'Unknown', 266 - 'conditions': condition 267 - }) 248 + result = self.newresult( 249 + attribute=name, 250 + value='%s %s%s' % (tds[0], uncertainty, unit), 251 + conditions=condition 252 + ) 268 253 results.append(result) 269 254 270 255 return results 256 + 257 + def newresult(self, attribute, value, conditions=''): 258 + return Result({ 259 + 'attribute': attribute, 260 + 'value': value, 261 + 'source': 'NIST', 262 + 'reliability': self.cfg['reliability'], 263 + 'conditions': conditions 264 + }) 271 265 272 266 def new_compound_request(self, compound): 273 267 if compound not in self.ignore_list:
+23 -17
FourmiCrawler/sources/WikipediaParser.py
··· 19 19 __spider = None 20 20 searched_compounds = [] 21 21 22 - def __init__(self): 23 - Source.__init__(self) 22 + cfg = {} 23 + 24 + def __init__(self, config={}): 25 + Source.__init__(self, config) 26 + self.cfg = config 24 27 25 28 def parse(self, response): 26 29 """ Distributes the above described behaviour """ ··· 44 47 prop_names = tr_list[::2] 45 48 prop_values = tr_list[1::2] 46 49 for i, prop_name in enumerate(prop_names): 47 - item = Result({ 48 - 'attribute': prop_name.extract().encode('utf-8'), 49 - 'value': prop_values[i].extract().encode('utf-8'), 50 - 'source': "Wikipedia", 51 - 'reliability': "Unknown", 52 - 'conditions': "" 53 - }) 50 + item = self.newresult( 51 + attribute=prop_name.extract().encode('utf-8'), 52 + value=prop_values[i].extract().encode('utf-8') 53 + ) 54 54 items.append(item) 55 55 log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) 56 56 ··· 61 61 log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG) 62 62 if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath( 63 63 'normalize-space(string())'): 64 - item = Result({ 65 - 'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 66 - 'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 67 - 'source': "Wikipedia", 68 - 'reliability': "Unknown", 69 - 'conditions': "" 70 - }) 64 + item = self.newresult( 65 + attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 66 + value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), 67 + ) 71 68 items.append(item) 72 69 log.msg( 73 70 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), ··· 116 113 """ find external links, named 'Identifiers' to different sources. 
""" 117 114 links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a' 118 115 '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract() 119 - return links 116 + return links 117 + 118 + def newresult(self, attribute, value): 119 + return Result({ 120 + 'attribute': attribute, 121 + 'value': value, 122 + 'source': 'Wikipedia', 123 + 'reliability': self.cfg['reliability'], 124 + 'conditions': '' 125 + })
+1 -1
FourmiCrawler/sources/source.py
··· 6 6 website = "http://something/*" # Regex of URI's the source is able to parse 7 7 _spider = None 8 8 9 - def __init__(self): 9 + def __init__(self, config={}): 10 10 """ 11 11 Initiation of a new Source 12 12 """
+2 -2
FourmiCrawler/spider.py
··· 9 9 A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data. 10 10 """ 11 11 name = "FourmiSpider" 12 - _sources = [] 13 - synonyms = set() 14 12 15 13 def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs): 16 14 """ ··· 18 16 :param compound: compound that will be searched. 19 17 :param selected_attributes: A list of regular expressions that the attributes should match. 20 18 """ 19 + self._sources = [] 20 + self.synonyms = set() 21 21 super(FourmiSpider, self).__init__(*args, **kwargs) 22 22 self.synonyms.add(compound) 23 23 self.selected_attributes = selected_attributes
+2 -2
README.md
··· 1 1 # Fourmi 2 2 3 - **Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) 3 + **Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=master) 4 4 5 - **Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) 5 + **Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=develop) 6 6 7 7 Fourmi is an web scraper for chemical substances. The program is designed to be 8 8 used as a search engine to search multiple chemical databases for a specific
+9 -45
fourmi.py
··· 17 17 --version Show version. 18 18 --verbose Verbose logging output. 19 19 --log=<file> Save log to an file. 20 - -o <file> --output=<file> Output file [default: result.*format*] 21 - -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] 20 + -o <file> --output=<file> Output file [default: results.*format*] 21 + -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv] 22 22 --include=<regex> Include only sources that match these regular expressions split by a comma. 23 23 --exclude=<regex> Exclude the sources that match these regular expressions split by a comma. 24 24 """ ··· 30 30 import docopt 31 31 32 32 from FourmiCrawler.spider import FourmiSpider 33 - from sourceloader import SourceLoader 33 + from utils.configurator import Configurator 34 + from utils.sourceloader import SourceLoader 34 35 35 36 36 37 def setup_crawler(compound, settings, source_loader, attributes): ··· 50 51 crawler.start() 51 52 52 53 53 - def scrapy_settings_manipulation(docopt_arguments): 54 - """ 55 - This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi 56 - project these are command line arguments. 57 - :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 58 - """ 59 - settings = get_project_settings() 60 - 61 - if docopt_arguments["--output"] != 'result.*format*': 62 - settings.overrides["FEED_URI"] = docopt_arguments["--output"] 63 - elif docopt_arguments["--format"] == "jsonlines": 64 - settings.overrides["FEED_URI"] = "results.json" 65 - elif docopt_arguments["--format"] is not None: 66 - settings.overrides["FEED_URI"] = "results." 
+ docopt_arguments["--format"] 67 - 68 - if docopt_arguments["--format"] is not None: 69 - settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"] 70 - 71 - return settings 72 - 73 - 74 - def start_log(docopt_arguments): 75 - """ 76 - This function starts the logging functionality of Scrapy using the settings given by the CLI. 77 - :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 78 - """ 79 - if docopt_arguments["--log"] is not None: 80 - if docopt_arguments["--verbose"]: 81 - log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG) 82 - else: 83 - log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING) 84 - else: 85 - if docopt_arguments["--verbose"]: 86 - log.start(logstdout=False, loglevel=log.DEBUG) 87 - else: 88 - log.start(logstdout=True, loglevel=log.WARNING) 89 - 90 - 91 54 def search(docopt_arguments, source_loader): 92 55 """ 93 56 The function that facilitates the search for a specific compound. 94 57 :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 95 58 :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. 96 59 """ 97 - start_log(docopt_arguments) 98 - settings = scrapy_settings_manipulation(docopt_arguments) 99 - setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(',')) 60 + conf = Configurator() 61 + conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"]) 62 + conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) 63 + setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) 100 64 reactor.run() 101 65 102 66 103 67 # The start for the Fourmi Command Line interface. 
104 68 if __name__ == '__main__': 105 - arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.2') 69 + arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0') 106 70 loader = SourceLoader() 107 71 108 72 if arguments["--include"]:
+10 -5
sourceloader.py utils/sourceloader.py
··· 3 3 import re 4 4 5 5 from FourmiCrawler.sources.source import Source 6 - 6 + from utils.configurator import Configurator 7 7 8 8 class SourceLoader: 9 9 sources = [] 10 10 11 - def __init__(self, rel_dir="FourmiCrawler/sources"): 11 + def __init__(self, rel_dir="../FourmiCrawler/sources"): 12 12 """ 13 13 The initiation of a SourceLoader, selects and indexes a directory for usable sources. 14 + Also loads a configuration file for Sources and passes the arguments in 15 + the named section to the source 14 16 :param rel_dir: A relative path to a directory. 15 17 """ 16 18 path = os.path.dirname(os.path.abspath(__file__)) 17 19 path += "/" + rel_dir 18 20 known_parser = set() 19 21 22 + config = Configurator.read_sourceconfiguration() 23 + 20 24 for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: 21 - mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) 25 + mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py]) 22 26 classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] 23 27 for cls in classes: 24 28 if issubclass(cls, Source) and cls not in known_parser: 25 - self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers? 29 + sourcecfg = Configurator.get_section(config, cls.__name__) 30 + self.sources.append(cls(sourcecfg)) 26 31 known_parser.add(cls) 27 32 28 33 def include(self, source_names): ··· 55 60 string += "Source: " + src.__class__.__name__ 56 61 string += " - " 57 62 string += "URI: " + src.website + "\n" 58 - return string 63 + return string
+50
tests/test_configurator.py
··· 1 + import unittest 2 + from utils.configurator import Configurator 3 + 4 + import ConfigParser 5 + 6 + class TestConfigurator(unittest.TestCase): 7 + 8 + def setUp(self): 9 + self.conf = Configurator() 10 + 11 + def test_set_output(self): 12 + self.conf.set_output(filename="test.txt", fileformat="csv") 13 + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt") 14 + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") 15 + 16 + self.conf.set_output("results.*format*", "jsonlines") 17 + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json") 18 + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines") 19 + 20 + self.conf.set_output("results.*format*", "csv") 21 + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") 22 + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") 23 + 24 + # def test_start_log(self): 25 + # self.conf.start_log("test.log", True) 26 + # self.conf.start_log("test.log", False) 27 + # self.conf.start_log(None, True) 28 + # self.conf.start_log(None, False) 29 + 30 + def test_read_sourceconfiguration(self): 31 + config = self.conf.read_sourceconfiguration() 32 + self.assertIsInstance(config, ConfigParser.ConfigParser) 33 + 34 + def test_get_section(self): 35 + config = ConfigParser.ConfigParser() 36 + section = self.conf.get_section(config, 'test') 37 + self.assertIn('reliability', section) 38 + self.assertEquals(section['reliability'], '') 39 + 40 + config.set('DEFAULT', 'reliability', 'Low') 41 + 42 + section = self.conf.get_section(config, 'test') 43 + self.assertEquals(section['reliability'], 'Low') 44 + 45 + config.add_section('test') 46 + config.set('test', 'var', 'Maybe') 47 + 48 + section = self.conf.get_section(config, 'test') 49 + self.assertEquals(section['reliability'], 'Low') 50 + self.assertEqual(section['var'], 'Maybe')
+1 -1
tests/test_sourceloader.py
··· 1 1 import unittest 2 2 3 - from sourceloader import SourceLoader 3 + from utils.sourceloader import SourceLoader 4 4 5 5 6 6 class TestSourceloader(unittest.TestCase):
+4 -4
tests/test_spider.py
··· 3 3 from scrapy.http import Request 4 4 5 5 from FourmiCrawler import spider 6 - from FourmiCrawler.sources.ChemSpider import ChemSpider 6 + from FourmiCrawler.sources.NIST import NIST 7 7 from FourmiCrawler.sources.source import Source 8 8 9 9 ··· 41 41 self.spi.add_source(src) 42 42 self.assertEqual(self.spi.start_requests(), []) 43 43 44 - src2 = ChemSpider() 44 + src2 = NIST() 45 45 self.spi.add_source(src2) 46 46 requests = self.spi.start_requests() 47 47 self.assertGreater(len(requests), 0) ··· 57 57 self.assertEqual(self.spi.get_synonym_requests("new_compound"), []) 58 58 self.assertIn("new_compound", self.spi.synonyms) 59 59 60 - src2 = ChemSpider() 60 + src2 = NIST() 61 61 self.spi.add_source(src2) 62 62 self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request) 63 63 self.assertIn("other_compound", self.spi.synonyms) 64 - self.assertEqual(self.spi.get_synonym_requests("other_compound"), []) 64 + self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
utils/__init__.py

This is a binary file and will not be displayed.

+81
utils/configurator.py
··· 1 + from scrapy import log 2 + from scrapy.utils.project import get_project_settings 3 + import ConfigParser 4 + 5 + class Configurator: 6 + """ 7 + A helper class in the fourmi class. This class is used to process the settings as set 8 + from one of the Fourmi applications. 9 + """ 10 + 11 + def __init__(self): 12 + self.scrapy_settings = get_project_settings() 13 + 14 + 15 + def set_output(self, filename, fileformat): 16 + """ 17 + This function manipulates the Scrapy output file settings that normally would be set in the settings file. 18 + In the Fourmi project these are command line arguments. 19 + :param filename: The filename of the file where the output will be put. 20 + :param fileformat: The format in which the output will be. 21 + """ 22 + 23 + if filename != 'results.*format*': 24 + self.scrapy_settings.overrides["FEED_URI"] = filename 25 + elif fileformat == "jsonlines": 26 + self.scrapy_settings.overrides["FEED_URI"] = "results.json" 27 + elif fileformat is not None: 28 + self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat 29 + 30 + if fileformat is not None: 31 + self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat 32 + 33 + 34 + def start_log(self, logfile, verbose): 35 + """ 36 + This function starts the logging functionality of Scrapy using the settings given by the CLI. 37 + :param logfile: The location where the logfile will be saved. 38 + :param verbose: A boolean value to switch between loglevels. 
39 + """ 40 + if logfile is not None: 41 + if verbose: 42 + log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG) 43 + else: 44 + log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING) 45 + else: 46 + if verbose: 47 + log.start(logstdout=False, loglevel=log.DEBUG) 48 + else: 49 + log.start(logstdout=True, loglevel=log.WARNING) 50 + 51 + @staticmethod 52 + def read_sourceconfiguration(): 53 + """ 54 + This function reads sources.cfg in the main folder for configuration 55 + variables for sources 56 + :return a ConfigParser object of sources.cfg 57 + """ 58 + config = ConfigParser.ConfigParser() 59 + config.read('sources.cfg') # [TODO]: should be softcoded eventually 60 + return config 61 + 62 + @staticmethod 63 + def get_section(config, sourcename): 64 + """ 65 + This function reads a config section labeled in variable sourcename and 66 + tests whether the reliability variable is set else set to empty string. 67 + Return the default section if the labeled config section does not exist 68 + :param config: a ConfigParser object 69 + :param sourcename: the name of the section to be read 70 + :return a dictionary of the section in the config labeled in sourcename 71 + """ 72 + section = dict() 73 + if config.has_section(sourcename): 74 + section = dict(config.items(sourcename)) 75 + elif config.defaults(): 76 + section = config.defaults() 77 + if 'reliability' not in section: 78 + log.msg('Reliability not set for %s' % sourcename, 79 + level=log.WARNING) 80 + section['reliability'] = '' 81 + return section