A web scraper built to search for specific information on a given compound (and its synonyms)

Merge branch 'release/v0.5.1'

+278 -98
+3
Changelog.md
··· 1 + ### v0.5.1 2 + - UPDATED: Logging functionality from command line 3 + - DEV: Code cleanup and extra tests
+4 -5
FourmiCrawler/sources/ChemSpider.py
··· 26 26 structure = 'Chemical-Structure.%s.html' 27 27 extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' 28 28 29 - def __init__(self, config={}): 29 + def __init__(self, config=None): 30 30 Source.__init__(self, config) 31 - self.cfg = config 32 31 self.ignore_list = [] 33 32 if 'token' not in self.cfg or self.cfg['token'] == '': 34 33 log.msg('ChemSpider token not set or empty, search/MassSpec API ' ··· 36 35 self.cfg['token'] = '' 37 36 self.search += self.cfg['token'] 38 37 self.extendedinfo += self.cfg['token'] 39 - 40 38 41 39 def parse(self, response): 42 40 sel = Selector(response) ··· 199 197 return properties 200 198 201 199 def newresult(self, attribute, value, conditions='', source='ChemSpider'): 202 - return Result({ 200 + return Result( 201 + { 203 202 'attribute': attribute, 204 203 'value': value, 205 204 'source': source, 206 205 'reliability': self.cfg['reliability'], 207 206 'conditions': conditions 208 - }) 207 + }) 209 208 210 209 def parse_searchrequest(self, response): 211 210 """Parse the initial response of the ChemSpider Search API """
+8 -11
FourmiCrawler/sources/NIST.py
··· 22 22 23 23 search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' 24 24 25 - cfg = {} 26 - 27 - def __init__(self, config={}): 25 + def __init__(self, config=None): 28 26 Source.__init__(self, config) 29 27 self.ignore_list = set() 30 - self.cfg = config 31 28 32 29 def parse(self, response): 33 30 sel = Selector(response) ··· 88 85 InChiKey, CAS number 89 86 """ 90 87 ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]') 91 - li = ul.xpath('li') 92 88 93 89 raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract() 94 90 for synonym in raw_synonyms[0].strip().split(';\n'): ··· 255 251 return results 256 252 257 253 def newresult(self, attribute, value, conditions=''): 258 - return Result({ 259 - 'attribute': attribute, 260 - 'value': value, 261 - 'source': 'NIST', 262 - 'reliability': self.cfg['reliability'], 263 - 'conditions': conditions 254 + return Result( 255 + { 256 + 'attribute': attribute, 257 + 'value': value, 258 + 'source': 'NIST', 259 + 'reliability': self.cfg['reliability'], 260 + 'conditions': conditions 264 261 }) 265 262 266 263 def new_compound_request(self, compound):
+83 -39
FourmiCrawler/sources/WikipediaParser.py
··· 12 12 """ Wikipedia scraper for chemical properties 13 13 14 14 This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values. 15 - It also returns requests with other external sources which contain information on parsed subject. 15 + It also returns requests with other external sources which contain information on parsed subject. 16 16 """ 17 17 18 18 website = "http://en.wikipedia.org/wiki/*" 19 19 __spider = None 20 20 searched_compounds = [] 21 21 22 - cfg = {} 23 - 24 - def __init__(self, config={}): 22 + def __init__(self, config=None): 25 23 Source.__init__(self, config) 26 - self.cfg = config 27 24 28 25 def parse(self, response): 29 - """ Distributes the above described behaviour """ 26 + """ 27 + Distributes the above described behaviour 28 + :param response: The incoming search request 29 + :return: Returns the found properties if response is unique or returns none if it's already known 30 + """ 30 31 log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG) 31 32 sel = Selector(response) 32 33 compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0] # makes sure to use main page ··· 38 39 return items 39 40 40 41 def parse_infobox(self, sel): 41 - """ scrape data from infobox on wikipedia. """ 42 + """ 43 + Scrape data from infobox on wikipedia. 44 + 45 + Data from two types of infoboxes: class="infobox bordered" and class="infobox" is scraped and 46 + :param sel: The selector with the html-information of the page to parse 47 + :return: item_list: Returns a list of properties with their values, source, etc.. 
48 + """ 49 + 42 50 items = [] 43 51 44 - # be sure to get chembox (wikipedia template) 52 + # scrape the chembox (wikipedia template) 53 + items = self.parse_chembox(sel, items) 54 + 55 + # scrape the drugbox (wikipedia template) 56 + items = self.parse_drugbox(sel, items) 57 + 58 + items = filter(lambda a: a['value'] != '', items) # remove items with an empty value 59 + item_list = self.clean_items(items) 60 + 61 + identifiers = self.get_identifiers(sel) 62 + 63 + #add extra sources to scrape from as requests 64 + for i, identifier in enumerate(identifiers): 65 + request = None 66 + #discard internal wikipedia links 67 + if re.match('//en\.wikipedia', identifier): 68 + log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING) 69 + #fix links starting with '//www.' 70 + elif re.match('/{2}', identifier): 71 + identifier = re.sub("/{2}", "http://", identifier) 72 + request = Request(identifier) 73 + else: 74 + request = Request(identifier) 75 + log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG) 76 + item_list.append(request) 77 + 78 + return item_list 79 + 80 + def parse_chembox(self, sel, items): 81 + """ 82 + Scrape data from chembox infobox on wikipedia. 83 + 84 + :param sel: The selector with the html-information of the page to parse 85 + :param items: the list of items where the result have to be stored in 86 + :return: items: the list of items with the new found and stored items 87 + """ 45 88 tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \ 46 89 xpath('normalize-space(string())') 47 90 prop_names = tr_list[::2] ··· 53 96 ) 54 97 items.append(item) 55 98 log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) 99 + return items 56 100 57 - #scrape the drugbox (wikipedia template) 101 + def parse_drugbox(self, sel, items): 102 + """ 103 + Scrape data from drugbox infobox on wikipedia. 
104 + 105 + :param sel: The selector with the html-information of the page to parse 106 + :param items: the list of items where the result have to be stored in 107 + :return: items: the list of items with the new found and stored items 108 + """ 58 109 tr_list2 = sel.xpath('.//table[@class="infobox"]//tr') 59 110 log.msg('dit: %s' % tr_list2, level=log.DEBUG) 60 111 for tablerow in tr_list2: ··· 69 120 log.msg( 70 121 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), 71 122 level=log.DEBUG) 72 - 73 - items = filter(lambda a: a['value'] != '', items) # remove items with an empty value 74 - item_list = self.clean_items(items) 75 - 76 - identifiers = self.get_identifiers(sel) 77 - 78 - #add extra sources to scrape from as requests 79 - for i, identifier in enumerate(identifiers): 80 - request = None 81 - #discard internal wikipedia links 82 - if re.match('//en\.wikipedia', identifier): 83 - log.msg('Found link to Wikipedia, this is not something to scrape: %s' % identifier, level=log.WARNING) 84 - #fix links starting with '//www.' 85 - elif re.match('/{2}', identifier): 86 - identifier = re.sub("/{2}", "http://", identifier) 87 - request = Request(identifier) 88 - else: 89 - request = Request(identifier) 90 - log.msg('New identifier found, request: %s' % identifier, level=log.DEBUG) 91 - item_list.append(request) 92 - 93 - return item_list 123 + return items 94 124 95 125 def new_compound_request(self, compound): 96 126 return Request(url=self.website[:-1] + compound, callback=self.parse) 97 127 98 128 @staticmethod 99 129 def clean_items(items): 100 - """ clean up properties using regex, makes it possible to split the values from the units """ 130 + 131 + """ 132 + Clean up properties using regex, makes it possible to split the values from the units 133 + 134 + Almost not in use, only cleans J/K/mol values and boiling/melting points. 135 + 136 + :param items: List of properties with their values, source, etc.. 
137 + :return: items: List of now cleaned up items 138 + """ 101 139 for item in items: 102 140 value = item['value'] 103 141 m = re.search('F;\s(\d+[\.,]?\d*)', value) # clean up numerical Kelvin value (after F) ··· 110 148 111 149 @staticmethod 112 150 def get_identifiers(sel): 113 - """ find external links, named 'Identifiers' to different sources. """ 151 + """ 152 + Find external links, named 'Identifiers' to different sources. 153 + 154 + :param sel: The selector with the html-information of the page to parse 155 + :return: links: New links which can be used to expand the crawlers search 156 + """ 114 157 links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a' 115 158 '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract() 116 159 return links 117 160 118 161 def newresult(self, attribute, value): 119 - return Result({ 120 - 'attribute': attribute, 121 - 'value': value, 122 - 'source': 'Wikipedia', 123 - 'reliability': self.cfg['reliability'], 124 - 'conditions': '' 162 + return Result( 163 + { 164 + 'attribute': attribute, 165 + 'value': value, 166 + 'source': 'Wikipedia', 167 + 'reliability': self.cfg['reliability'], 168 + 'conditions': '' 125 169 })
+4 -1
FourmiCrawler/sources/source.py
··· 6 6 website = "http://something/*" # Regex of URI's the source is able to parse 7 7 _spider = None 8 8 9 - def __init__(self, config={}): 9 + def __init__(self, config=None): 10 10 """ 11 11 Initiation of a new Source 12 12 """ 13 + self.cfg = {} 14 + if config is not None: 15 + self.cfg = config 13 16 pass 14 17 15 18 def parse(self, response):
+5 -2
FourmiCrawler/spider.py
··· 10 10 """ 11 11 name = "FourmiSpider" 12 12 13 - def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs): 13 + def __init__(self, compound=None, selected_attributes=None, *args, **kwargs): 14 14 """ 15 15 Initiation of the Spider 16 16 :param compound: compound that will be searched. ··· 20 20 self.synonyms = set() 21 21 super(FourmiSpider, self).__init__(*args, **kwargs) 22 22 self.synonyms.add(compound) 23 - self.selected_attributes = selected_attributes 23 + if selected_attributes is None: 24 + self.selected_attributes = [".*"] 25 + else: 26 + self.selected_attributes = selected_attributes 24 27 25 28 def parse(self, response): 26 29 """
+5 -9
README.md
··· 23 23 24 24 ### Installing 25 25 26 - If you're installing Fourmi, please take a look at our [installation guide](...) 27 - on our wiki. When you've installed the application, make sure to check our 28 - [usage guide](...). 26 + If you're installing Fourmi, please take a look at our installation guides 27 + on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our 28 + usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI). 29 29 30 30 ### Using the Source 31 31 32 32 To use the Fourmi source code multiple dependencies are required. Take a look at 33 - the [wiki page](...) on using the application source code for a step by step 33 + our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in our a step by step 34 34 installation guide. 35 35 36 36 When developing for the Fourmi project keep in mind that code readability is a 37 37 must. To maintain the readability, code should be conform with the 38 38 [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python 39 39 code. More information about the different structures and principles of the 40 - Fourmi application can be found on our [wiki](...). 40 + Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki). 41 41 42 42 ### To Do 43 43 ··· 45 45 46 46 __Main goals:__ 47 47 48 - - Improve our documentation and guides. (Assignee: Dekker) 49 48 - Build an graphical user interface(GUI) as alternative for the command line 50 49 interface(CLI). (Assignee: Harmen) 51 50 - Compiling the source into an windows executable. (Assignee: Bas) 52 - - Create an configuration file to hold logins and API keys. 53 - - Determine reliability of our data point. 54 - - Create an module to gather data from NIST. 
(Assignee: Rob) 55 51 - Create an module to gather data from PubChem. (Assignee: Nout) 56 52 57 53 __Side goals:__
+103
SIGNED.md
··· 1 + ##### Signed by https://keybase.io/jdekker 2 + ``` 3 + -----BEGIN PGP SIGNATURE----- 4 + Version: GnuPG v1.4.11 (GNU/Linux) 5 + 6 + iQIcBAABAgAGBQJTnfAAAAoJEJrQ9RIUCT6/KZIQAME07yzAG5hnqsQof5ESoeQs 7 + 5wBxAhiBIX/0yn3qIT/eMh0ubCKUZsqJ3/PzUljeMJ6CGtwxFYfTWkgjYlOoAz9G 8 + fS7CjPmRPyiu+MFo5he+oVRmLUMqfuLUrCyuIxJwMXq5YbQvzyqiffvxr8VRULtV 9 + 3c0drWfQMX1ZeAWSIYN0xuMndzvaqIAQU6o4tSQf/rUiKlM2NnTDNUHu2PY9FED/ 10 + IJwM/IgAMAkJARyL7ltq6pHzORsu7sd2Nhv0esa0Gs2GSuRjKueeMZvJzpDAufy9 11 + bWn9EqKhVwPR6zWnXRmNj9Ymj1w167hIUYcBdFhC7kie5zv9+pDE6d/s7pw/Rejd 12 + L0k8LKBGtJ8o7SKYR9kcNLDWXEnHjfCraD+14FMYqQPcz2ekoV6Exv/mP8qRPwUc 13 + b+FtjJtW8fEiOMAyjMOvLTzYbCVwjdErAqgNdHeSByi1nxfrphjajRiNUt7fVimJ 14 + ++QZzKCj6xN2MuTJ41KbZ8teiUXwQB4OKKij0fgoy0RBwW0vqH6MF7cCKm1zT1Qa 15 + 9FGlBU2jSybQqUu4lJ/eUjO/3tQMhJErQJU/i+6lwi7OMnS9J/g17Heghp5Hxyhc 16 + VWvhR56pbWLIL2XQqDGGEqPDIzXohHnbRJ1N71b06akIvIIrTqc6Glu4PJeUG/Pe 17 + EF8/jBwydxbKUOyKRSQS 18 + =xWbc 19 + -----END PGP SIGNATURE----- 20 + 21 + ``` 22 + 23 + <!-- END SIGNATURES --> 24 + 25 + ### Begin signed statement 26 + 27 + #### Expect 28 + 29 + ``` 30 + size exec file contents 31 + ./ 32 + 17591 .coverage 1dd1207846db74e407d3a4a1951b8e81934a4693385d39f6c337a224375bad39|1b7ead09cf213b5a9545557be982aaa30238b689bb54adf604f82b12ef521eb2 33 + 375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1 34 + 464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c 35 + 97 Changelog.md bcbce9a33bbbbcd18fd7788e6dc3a9c4b13dff7128ea99968994c1b290ddc931 36 + FourmiCrawler/ 37 + 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 38 + 304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806 39 + 2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49 40 + 716 settings.py 37a8f63e123bccc77076d574617a522b30c1d7c5e893ec3d78cc40e1563dd8a6 41 + sources/ 42 + 9991 ChemSpider.py 
847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d 43 + 9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644 44 + 6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97 45 + 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 46 + 1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4 47 + 3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a 48 + 1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c 49 + 3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3 50 + 3659 x fourmi.py 81781ed7299e447e6fc551fba69e62cd7a1d63f27dfa063927f4c5c10f5ac331 51 + 200850 log.txt d76e741f9e7b67c2574e9cdbbe499ea4861f6e0bd11e5962fdaf9d8720effef8 52 + 184692 results.csv 31132f7f394babeb5dfd249aaa714756017b2e1b314b6715f57e6ad9524e5be8|d0bb724f6d714ec7a4a1ad2052f70dd4510b5ac08d616e24b5e9a903dedab586 53 + 261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85 54 + tests/ 55 + 1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b 56 + 2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80 57 + 1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031 58 + 1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869 59 + 2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299 60 + utils/ 61 + 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 62 + 3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921 63 + 2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37 64 + ``` 65 + 66 + #### Ignore 67 + 68 + ``` 69 + /SIGNED.md 70 + ``` 71 + 72 + #### Presets 73 + 74 + ``` 75 + git # ignore .git and anything as described by .gitignore files 
76 + dropbox # ignore .dropbox-cache and other Dropbox-related files 77 + kb # ignore anything as described by .kbignore files 78 + ``` 79 + 80 + <!-- summarize version = 0.0.9 --> 81 + 82 + ### End signed statement 83 + 84 + <hr> 85 + 86 + #### Notes 87 + 88 + With keybase you can sign any directory's contents, whether it's a git repo, 89 + source code distribution, or a personal documents folder. It aims to replace the drudgery of: 90 + 91 + 1. comparing a zipped file to a detached statement 92 + 2. downloading a public key 93 + 3. confirming it is in fact the author's by reviewing public statements they've made, using it 94 + 95 + All in one simple command: 96 + 97 + ```bash 98 + keybase dir verify 99 + ``` 100 + 101 + There are lots of options, including assertions for automating your checks. 102 + 103 + For more info, check out https://keybase.io/docs/command_line/code_signing
+9 -6
fourmi.py
··· 5 5 Usage: 6 6 fourmi search <compound> 7 7 fourmi [options] search <compound> 8 + fourmi [-v | -vv | -vvv] [options] search <compound> 8 9 fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound> 9 10 fourmi list 10 11 fourmi [--include=<sourcename> | --exclude=<sourcename>] list ··· 15 16 --attributes=<regex> Include only that match these regular expressions split by a comma. [default: .*] 16 17 -h --help Show this screen. 17 18 --version Show version. 18 - --verbose Verbose logging output. 19 + -v Verbose logging output. (Multiple occurrences increase logging level) 19 20 --log=<file> Save log to an file. 20 21 -o <file> --output=<file> Output file [default: results.*format*] 21 22 -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv] ··· 25 26 26 27 from twisted.internet import reactor 27 28 from scrapy.crawler import Crawler 28 - from scrapy import log, signals 29 - from scrapy.utils.project import get_project_settings 29 + from scrapy import signals, log 30 30 import docopt 31 31 32 32 from FourmiCrawler.spider import FourmiSpider ··· 58 58 :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. 
59 59 """ 60 60 conf = Configurator() 61 - conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"]) 61 + conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) 62 62 conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) 63 - setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) 63 + setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, 64 + source_loader, docopt_arguments["--attributes"].split(',')) 65 + log.start(conf.scrapy_settings.get("LOG_FILE"), 66 + conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) 64 67 reactor.run() 65 68 66 69 67 70 # The start for the Fourmi Command Line interface. 68 71 if __name__ == '__main__': 69 - arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0') 72 + arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.1') 70 73 loader = SourceLoader() 71 74 72 75 if arguments["--include"]:
+24 -6
tests/test_configurator.py
··· 1 1 import unittest 2 + import ConfigParser 3 + 2 4 from utils.configurator import Configurator 3 5 4 - import ConfigParser 5 6 6 7 class TestConfigurator(unittest.TestCase): 7 8 ··· 21 22 self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") 22 23 self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") 23 24 24 - # def test_start_log(self): 25 - # self.conf.start_log("test.log", True) 26 - # self.conf.start_log("test.log", False) 27 - # self.conf.start_log(None, True) 28 - # self.conf.start_log(None, False) 25 + def test_start_log(self): 26 + for i in range(0, 3): 27 + self.conf.set_logging("TEST", i) 28 + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST") 29 + if i > 0: 30 + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True) 31 + if i > 1: 32 + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False) 33 + else: 34 + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) 35 + else: 36 + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False) 37 + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) 38 + if i == 1: 39 + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING") 40 + elif i == 2: 41 + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO") 42 + elif i == 3: 43 + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG") 44 + 45 + self.conf.set_logging(verbose=i) 46 + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None) 29 47 30 48 def test_read_sourceconfiguration(self): 31 49 config = self.conf.read_sourceconfiguration()
+1
tests/test_pipeline.py
··· 13 13 def test_none_pipeline(self): 14 14 # Testing the pipeline that replaces the None values in items. 15 15 self.testItem["value"] = "abc" 16 + self.testItem["source"] = None 16 17 pipe = pipelines.RemoveNonePipeline() 17 18 processed = pipe.process_item(self.testItem, spider.FourmiSpider()) 18 19
-1
tests/test_spider.py
··· 47 47 self.assertGreater(len(requests), 0) 48 48 self.assertIsInstance(requests[0], Request) 49 49 50 - 51 50 def test_synonym_requests(self): 52 51 # A test for the synonym request function 53 52 self.spi._sources = []
+28 -18
utils/configurator.py
··· 1 - from scrapy import log 1 + import ConfigParser 2 + 2 3 from scrapy.utils.project import get_project_settings 3 - import ConfigParser 4 + 4 5 5 6 class Configurator: 6 7 """ ··· 10 11 11 12 def __init__(self): 12 13 self.scrapy_settings = get_project_settings() 13 - 14 14 15 15 def set_output(self, filename, fileformat): 16 16 """ ··· 30 30 if fileformat is not None: 31 31 self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat 32 32 33 - 34 - def start_log(self, logfile, verbose): 33 + def set_logging(self, logfile=None, verbose=0): 35 34 """ 36 - This function starts the logging functionality of Scrapy using the settings given by the CLI. 35 + This function changes the default settings of Scapy's logging functionality 36 + using the settings given by the CLI. 37 37 :param logfile: The location where the logfile will be saved. 38 - :param verbose: A boolean value to switch between loglevels. 38 + :param verbose: A integer value to switch between loglevels. 39 39 """ 40 + if verbose != 0: 41 + self.scrapy_settings.overrides["LOG_ENABLED"] = True 42 + else: 43 + self.scrapy_settings.overrides["LOG_ENABLED"] = False 44 + 45 + if verbose == 1: 46 + self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING" 47 + elif verbose == 2: 48 + self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO" 49 + else: 50 + self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG" 51 + 52 + if verbose > 1: 53 + self.scrapy_settings.overrides["LOG_STDOUT"] = False 54 + else: 55 + self.scrapy_settings.overrides["LOG_STDOUT"] = True 56 + 40 57 if logfile is not None: 41 - if verbose: 42 - log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG) 43 - else: 44 - log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING) 58 + self.scrapy_settings.overrides["LOG_FILE"] = logfile 45 59 else: 46 - if verbose: 47 - log.start(logstdout=False, loglevel=log.DEBUG) 48 - else: 49 - log.start(logstdout=True, loglevel=log.WARNING) 60 + self.scrapy_settings.overrides["LOG_FILE"] = None 50 61 
51 62 @staticmethod 52 63 def read_sourceconfiguration(): ··· 56 67 :return a ConfigParser object of sources.cfg 57 68 """ 58 69 config = ConfigParser.ConfigParser() 59 - config.read('sources.cfg') # [TODO]: should be softcoded eventually 70 + config.read('sources.cfg') # [TODO]: should be softcoded eventually 60 71 return config 61 72 62 73 @staticmethod ··· 75 86 elif config.defaults(): 76 87 section = config.defaults() 77 88 if 'reliability' not in section: 78 - log.msg('Reliability not set for %s' % sourcename, 79 - level=log.WARNING) 89 + print 'WARNING: Reliability not set for %s' % sourcename 80 90 section['reliability'] = '' 81 91 return section
+1
utils/sourceloader.py
··· 5 5 from FourmiCrawler.sources.source import Source 6 6 from utils.configurator import Configurator 7 7 8 + 8 9 class SourceLoader: 9 10 sources = [] 10 11