A web scraper built to search for specific information on a given compound (and its pseudonyms)

Merge branch 'develop' into feature/GUI

+330 -75
+12
Changelog.md
··· 1 + ### v0.5.3 2 + - FIX: It is now again possible to use both verbose and the source inclusion/exclusion options 3 + - FIX: Logging is now "actually" disabled if not using the verbose option. 4 + - FEATURE: Added support for PubChem 5 + 6 + ### v0.5.2 7 + - FIX: Signature used to contain untracked and older files, current signature 8 + should be correct. 9 + 10 + ### v0.5.1 11 + - UPDATED: Logging functionality from command line 12 + - DEV: Code cleanup and extra tests
+3 -1
FourmiCrawler/settings.py
··· 18 18 FEED_URI = 'results.json' 19 19 FEED_FORMAT = 'jsonlines' 20 20 21 - 22 21 # Crawl responsibly by identifying yourself (and your website) on the 23 22 # user-agent 24 23 24 + # [todo] - Check for repercussions on spoofing the user agent 25 + 25 26 # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' 27 + USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
+4 -5
FourmiCrawler/sources/ChemSpider.py
··· 26 26 structure = 'Chemical-Structure.%s.html' 27 27 extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' 28 28 29 - def __init__(self, config={}): 29 + def __init__(self, config=None): 30 30 Source.__init__(self, config) 31 - self.cfg = config 32 31 self.ignore_list = [] 33 32 if 'token' not in self.cfg or self.cfg['token'] == '': 34 33 log.msg('ChemSpider token not set or empty, search/MassSpec API ' ··· 36 35 self.cfg['token'] = '' 37 36 self.search += self.cfg['token'] 38 37 self.extendedinfo += self.cfg['token'] 39 - 40 38 41 39 def parse(self, response): 42 40 sel = Selector(response) ··· 199 197 return properties 200 198 201 199 def newresult(self, attribute, value, conditions='', source='ChemSpider'): 202 - return Result({ 200 + return Result( 201 + { 203 202 'attribute': attribute, 204 203 'value': value, 205 204 'source': source, 206 205 'reliability': self.cfg['reliability'], 207 206 'conditions': conditions 208 - }) 207 + }) 209 208 210 209 def parse_searchrequest(self, response): 211 210 """Parse the initial response of the ChemSpider Search API """
+8 -11
FourmiCrawler/sources/NIST.py
··· 22 22 23 23 search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' 24 24 25 - cfg = {} 26 - 27 - def __init__(self, config={}): 25 + def __init__(self, config=None): 28 26 Source.__init__(self, config) 29 27 self.ignore_list = set() 30 - self.cfg = config 31 28 32 29 def parse(self, response): 33 30 sel = Selector(response) ··· 88 85 InChiKey, CAS number 89 86 """ 90 87 ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]') 91 - li = ul.xpath('li') 92 88 93 89 raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract() 94 90 for synonym in raw_synonyms[0].strip().split(';\n'): ··· 255 251 return results 256 252 257 253 def newresult(self, attribute, value, conditions=''): 258 - return Result({ 259 - 'attribute': attribute, 260 - 'value': value, 261 - 'source': 'NIST', 262 - 'reliability': self.cfg['reliability'], 263 - 'conditions': conditions 254 + return Result( 255 + { 256 + 'attribute': attribute, 257 + 'value': value, 258 + 'source': 'NIST', 259 + 'reliability': self.cfg['reliability'], 260 + 'conditions': conditions 264 261 }) 265 262 266 263 def new_compound_request(self, compound):
+111
FourmiCrawler/sources/PubChem.py
··· 1 + from scrapy.http import Request 2 + from scrapy import log 3 + from source import Source 4 + from scrapy.selector import Selector 5 + from FourmiCrawler.items import Result 6 + import re 7 + 8 + 9 + class PubChem(Source): 10 + """ PubChem scraper for chemical properties 11 + 12 + This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance, 13 + including sources of the values of properties. 14 + """ 15 + 16 + #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used 17 + website = 'https://*.ncbi.nlm.nih.gov/*' 18 + website_www = 'https://www.ncbi.nlm.nih.gov/*' 19 + website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' 20 + search = 'pccompound?term=%s' 21 + data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' 22 + 23 + __spider = None 24 + searched_compounds = set() 25 + 26 + def __init__(self, config): 27 + Source.__init__(self, config) 28 + self.cfg = config 29 + 30 + def parse(self, response): 31 + """ 32 + Distributes the above described behaviour 33 + :param response: The incoming search request 34 + :return Returns the found properties if response is unique or returns none if it's already known 35 + """ 36 + requests = [] 37 + log.msg('A response from %s just arrived!' 
% response.url, level=log.DEBUG) 38 + 39 + sel = Selector(response) 40 + compound = sel.xpath('//h1/text()').extract()[0] 41 + if compound in self.searched_compounds: 42 + return None 43 + 44 + self.searched_compounds.update(compound) 45 + raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0] 46 + for synonym in raw_synonyms.strip().split(', '): 47 + log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG) 48 + self.searched_compounds.update(synonym) 49 + self._spider.get_synonym_requests(synonym) 50 + log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) 51 + 52 + n = re.search(r'cid=(\d+)',response.url) 53 + if n: 54 + cid = n.group(1) 55 + log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach 56 + # the seperate html page which contains the properties and their values 57 + 58 + #using this cid to get the right url and scrape it 59 + requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) 60 + return requests 61 + 62 + def parse_data(self, response): 63 + """ 64 + Parse data found in 'Chemical and Physical properties' part of a substance page. 65 + :param response: The response with the page to parse 66 + :return: requests: Returns a list of properties with their values, source, etc. 
67 + """ 68 + log.msg('parsing data', level=log.DEBUG) 69 + requests = [] 70 + 71 + sel = Selector(response) 72 + props = sel.xpath('//div') 73 + 74 + for prop in props: 75 + prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing 76 + if prop.xpath('a'): # parsing for single value in property 77 + prop_source = ''.join(prop.xpath('a/@title').extract()) 78 + prop_value = ''.join(prop.xpath('a/text()').extract()) 79 + new_prop = Result({ 80 + 'attribute': prop_name, 81 + 'value': prop_value, 82 + 'source': prop_source, 83 + 'reliability': 'Unknown', 84 + 'conditions': '' 85 + }) 86 + log.msg('PubChem prop: |%s| |%s| |%s|' % 87 + (new_prop['attribute'], new_prop['value'], 88 + new_prop['source']), level=log.DEBUG) 89 + requests.append(new_prop) 90 + elif prop.xpath('ul'): # parsing for multiple values (list) in property 91 + prop_values = prop.xpath('ul//li') 92 + for prop_li in prop_values: 93 + prop_value = ''.join(prop_li.xpath('a/text()').extract()) 94 + prop_source = ''.join(prop_li.xpath('a/@title').extract()) 95 + new_prop = Result({ 96 + 'attribute': prop_name, 97 + 'value': prop_value, 98 + 'source': prop_source, 99 + 'reliability': 'Unknown', 100 + 'conditions': '' 101 + }) 102 + log.msg('PubChem prop: |%s| |%s| |%s|' % 103 + (new_prop['attribute'], new_prop['value'], 104 + new_prop['source']), level=log.DEBUG) 105 + requests.append(new_prop) 106 + 107 + return requests 108 + 109 + 110 + def new_compound_request(self, compound): 111 + return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
+13 -14
FourmiCrawler/sources/WikipediaParser.py
··· 1 + import re 2 + 1 3 from scrapy.http import Request 2 4 from scrapy import log 3 - from source import Source 4 5 from scrapy.selector import Selector 6 + 7 + from source import Source 5 8 from FourmiCrawler.items import Result 6 - import re 7 9 8 10 9 11 class WikipediaParser(Source): ··· 17 19 __spider = None 18 20 searched_compounds = [] 19 21 20 - cfg = {} 21 - 22 - def __init__(self, config={}): 22 + def __init__(self, config=None): 23 23 Source.__init__(self, config) 24 - self.cfg = config 25 24 26 25 def parse(self, response): 27 26 """ ··· 53 52 # scrape the chembox (wikipedia template) 54 53 items = self.parse_chembox(sel, items) 55 54 56 - #scrape the drugbox (wikipedia template) 55 + # scrape the drugbox (wikipedia template) 57 56 items = self.parse_drugbox(sel, items) 58 57 59 58 items = filter(lambda a: a['value'] != '', items) # remove items with an empty value ··· 122 121 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), 123 122 level=log.DEBUG) 124 123 return items 125 - 126 124 127 125 def new_compound_request(self, compound): 128 126 return Request(url=self.website[:-1] + compound, callback=self.parse) ··· 161 159 return links 162 160 163 161 def newresult(self, attribute, value): 164 - return Result({ 165 - 'attribute': attribute, 166 - 'value': value, 167 - 'source': 'Wikipedia', 168 - 'reliability': self.cfg['reliability'], 169 - 'conditions': '' 162 + return Result( 163 + { 164 + 'attribute': attribute, 165 + 'value': value, 166 + 'source': 'Wikipedia', 167 + 'reliability': self.cfg['reliability'], 168 + 'conditions': '' 170 169 })
+4 -1
FourmiCrawler/sources/source.py
··· 6 6 website = "http://something/*" # Regex of URI's the source is able to parse 7 7 _spider = None 8 8 9 - def __init__(self, config={}): 9 + def __init__(self, config=None): 10 10 """ 11 11 Initiation of a new Source 12 12 """ 13 + self.cfg = {} 14 + if config is not None: 15 + self.cfg = config 13 16 pass 14 17 15 18 def parse(self, response):
+5 -2
FourmiCrawler/spider.py
··· 10 10 """ 11 11 name = "FourmiSpider" 12 12 13 - def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs): 13 + def __init__(self, compound=None, selected_attributes=None, *args, **kwargs): 14 14 """ 15 15 Initiation of the Spider 16 16 :param compound: compound that will be searched. ··· 20 20 self.synonyms = set() 21 21 super(FourmiSpider, self).__init__(*args, **kwargs) 22 22 self.synonyms.add(compound) 23 - self.selected_attributes = selected_attributes 23 + if selected_attributes is None: 24 + self.selected_attributes = [".*"] 25 + else: 26 + self.selected_attributes = selected_attributes 24 27 25 28 def parse(self, response): 26 29 """
+5 -9
README.md
··· 23 23 24 24 ### Installing 25 25 26 - If you're installing Fourmi, please take a look at our [installation guide](...) 27 - on our wiki. When you've installed the application, make sure to check our 28 - [usage guide](...). 26 + If you're installing Fourmi, please take a look at our installation guides 27 + on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our 28 + usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI). 29 29 30 30 ### Using the Source 31 31 32 32 To use the Fourmi source code multiple dependencies are required. Take a look at 33 - the [wiki page](...) on using the application source code for a step by step 33 + our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in a step by step 34 34 installation guide. 35 35 36 36 When developing for the Fourmi project keep in mind that code readability is a 37 37 must. To maintain the readability, code should be conform with the 38 38 [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python 39 39 code. More information about the different structures and principles of the 40 - Fourmi application can be found on our [wiki](...). 40 + Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki). 41 41 42 42 ### To Do 43 43 ··· 45 45 46 46 __Main goals:__ 47 47 48 - Improve our documentation and guides. (Assignee: Dekker) 49 48 - Build a graphical user interface(GUI) as alternative for the command line 50 49 interface(CLI). (Assignee: Harmen) 51 50 - Compiling the source into a windows executable. (Assignee: Bas) 52 - - Create an configuration file to hold logins and API keys. 53 - - Determine reliability of our data point. 54 - - Create an module to gather data from NIST. 
(Assignee: Rob) 55 51 - Create a module to gather data from PubChem. (Assignee: Nout) 56 52 57 53 __Side goals:__
+101
SIGNED.md
··· 1 + ##### Signed by https://keybase.io/jdekker 2 + ``` 3 + -----BEGIN PGP SIGNATURE----- 4 + Version: GnuPG v1.4.11 (GNU/Linux) 5 + 6 + iQIcBAABAgAGBQJTn3GgAAoJEJrQ9RIUCT6/CI4P/RSAQrd6JugGZoQu/gNdW6eB 7 + MYCybqYGZiieVhUaGOnFNVlp68YpXH+sP/Uc6hXEX30UQEsDmhMeT5NA7ZMS+zJ9 8 + MNHGQdJq22lGb3+VoVBV4RTMdkQXOXvx6p5biskjIEtM3tfTxP529GvAX2TFUNnt 9 + gGWk28EDr30M95XwDxwWo+57Xv8VtSb3VSvXEbrdwGYf8EoQo9oPtzYQ0YcdupcC 10 + ET8bukYVcwpAjoTnPlEy89TiHHohwmimr2ASXeQ64Ks5wfjzcF7NENCAmaAfR+KI 11 + VLLuGqdWMBx1ewVuAXTCZ0Mga/kBoRUaO0PC13UmL8LhhZY9Z3cwD4UnPU35/RQi 12 + IbLfQcZHf/gEvyMeiTYCsyWpm+/xxn1+EfHol4/Q9VSXzZgRBX05Ik6tqeCvjdgG 13 + 4PyHBaJTTm/HfMNdg3mr1mbyjTv5UxglEyPv+Y4NdfoVfepkXsXbzvNSyVffZ3Bw 14 + UaFp7KzIC4Jugdpv63FleiAdDY0+iZ5shH86wD1+HJ0/a87kn5Ao1yESby7J7U+f 15 + poZQYeMFeuC0T5hY/3iYoyvZ68oH918ESESiucSulp5BvfwuqGL2+xo5uJIwGYXE 16 + 3IDQC7xbA14JHX86IVJlSHAD33iWyiC+5yjw4/bRRVl37KPsLdHiXH3YIRnF5I2I 17 + ZbM/uDYyJdZbBe4UoCoF 18 + =AMhi 19 + -----END PGP SIGNATURE----- 20 + 21 + ``` 22 + 23 + <!-- END SIGNATURES --> 24 + 25 + ### Begin signed statement 26 + 27 + #### Expect 28 + 29 + ``` 30 + size exec file contents 31 + ./ 32 + 375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1 33 + 464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c 34 + 428 Changelog.md c7791d1914ddca9ff1549d90468a79787a7feafe94cecd756e3d7cbd4bcbc7df 35 + FourmiCrawler/ 36 + 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 37 + 304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806 38 + 2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49 39 + 914 settings.py 0be2eaf8e83e85ed27754c896421180fc80cb5ce44449aa9f1048e465d1a96f2 40 + sources/ 41 + 9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d 42 + 9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644 43 + 4754 PubChem.py 
58ed4c92519e385f2768cf8034b006b18f8a21632cb1c5a0849b1a329a8c6ffb 44 + 6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97 45 + 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 46 + 1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4 47 + 3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a 48 + 1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c 49 + 3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3 50 + 3676 x fourmi.py 2ff89f97fd2a49d08417d9ab6cf08e88944d0c45f54ec84550b530be48676c23 51 + 261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85 52 + tests/ 53 + 1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b 54 + 2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80 55 + 1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031 56 + 1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869 57 + 2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299 58 + utils/ 59 + 0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 60 + 3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921 61 + 2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37 62 + ``` 63 + 64 + #### Ignore 65 + 66 + ``` 67 + /SIGNED.md 68 + ``` 69 + 70 + #### Presets 71 + 72 + ``` 73 + git # ignore .git and anything as described by .gitignore files 74 + dropbox # ignore .dropbox-cache and other Dropbox-related files 75 + kb # ignore anything as described by .kbignore files 76 + ``` 77 + 78 + <!-- summarize version = 0.0.9 --> 79 + 80 + ### End signed statement 81 + 82 + <hr> 83 + 84 + #### Notes 85 + 86 + With keybase you can sign any directory's contents, whether 
it's a git repo, 87 + source code distribution, or a personal documents folder. It aims to replace the drudgery of: 88 + 89 + 1. comparing a zipped file to a detached statement 90 + 2. downloading a public key 91 + 3. confirming it is in fact the author's by reviewing public statements they've made, using it 92 + 93 + All in one simple command: 94 + 95 + ```bash 96 + keybase dir verify 97 + ``` 98 + 99 + There are lots of options, including assertions for automating your checks. 100 + 101 + For more info, check out https://keybase.io/docs/command_line/code_signing
+10 -7
fourmi.py
··· 5 5 Usage: 6 6 fourmi search <compound> 7 7 fourmi [options] search <compound> 8 - fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound> 8 + fourmi [options] [-v | -vv | -vvv] [--include=<sourcename> | --exclude=<sourcename>] search <compound> 9 9 fourmi list 10 10 fourmi [--include=<sourcename> | --exclude=<sourcename>] list 11 11 fourmi -h | --help ··· 15 15 --attributes=<regex> Include only that match these regular expressions split by a comma. [default: .*] 16 16 -h --help Show this screen. 17 17 --version Show version. 18 - --verbose Verbose logging output. 18 + -v Verbose logging output. (Multiple occurrences increase logging level) 19 19 --log=<file> Save log to an file. 20 20 -o <file> --output=<file> Output file [default: results.*format*] 21 21 -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv] ··· 25 25 26 26 from twisted.internet import reactor 27 27 from scrapy.crawler import Crawler 28 - from scrapy import log, signals 29 - from scrapy.utils.project import get_project_settings 28 + from scrapy import signals, log 30 29 import docopt 31 30 32 31 from FourmiCrawler.spider import FourmiSpider ··· 58 57 :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. 
59 58 """ 60 59 conf = Configurator() 61 - conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"]) 60 + conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) 62 61 conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) 63 - setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) 62 + setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, 63 + source_loader, docopt_arguments["--attributes"].split(',')) 64 + if conf.scrapy_settings.getbool("LOG_ENABLED"): 65 + log.start(conf.scrapy_settings.get("LOG_FILE"), 66 + conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) 64 67 reactor.run() 65 68 66 69 67 70 # The start for the Fourmi Command Line interface. 68 71 if __name__ == '__main__': 69 - arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0') 72 + arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.3') 70 73 loader = SourceLoader() 71 74 72 75 if arguments["--include"]:
+24 -6
tests/test_configurator.py
··· 1 1 import unittest 2 + import ConfigParser 3 + 2 4 from utils.configurator import Configurator 3 5 4 - import ConfigParser 5 6 6 7 class TestConfigurator(unittest.TestCase): 7 8 ··· 21 22 self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") 22 23 self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") 23 24 24 - # def test_start_log(self): 25 - # self.conf.start_log("test.log", True) 26 - # self.conf.start_log("test.log", False) 27 - # self.conf.start_log(None, True) 28 - # self.conf.start_log(None, False) 25 + def test_start_log(self): 26 + for i in range(0, 3): 27 + self.conf.set_logging("TEST", i) 28 + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST") 29 + if i > 0: 30 + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True) 31 + if i > 1: 32 + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False) 33 + else: 34 + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) 35 + else: 36 + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False) 37 + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) 38 + if i == 1: 39 + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING") 40 + elif i == 2: 41 + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO") 42 + elif i == 3: 43 + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG") 44 + 45 + self.conf.set_logging(verbose=i) 46 + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None) 29 47 30 48 def test_read_sourceconfiguration(self): 31 49 config = self.conf.read_sourceconfiguration()
+1
tests/test_pipeline.py
··· 13 13 def test_none_pipeline(self): 14 14 # Testing the pipeline that replaces the None values in items. 15 15 self.testItem["value"] = "abc" 16 + self.testItem["source"] = None 16 17 pipe = pipelines.RemoveNonePipeline() 17 18 processed = pipe.process_item(self.testItem, spider.FourmiSpider()) 18 19
-1
tests/test_spider.py
··· 47 47 self.assertGreater(len(requests), 0) 48 48 self.assertIsInstance(requests[0], Request) 49 49 50 - 51 50 def test_synonym_requests(self): 52 51 # A test for the synonym request function 53 52 self.spi._sources = []
+28 -18
utils/configurator.py
··· 1 - from scrapy import log 1 + import ConfigParser 2 + 2 3 from scrapy.utils.project import get_project_settings 3 - import ConfigParser 4 + 4 5 5 6 class Configurator: 6 7 """ ··· 10 11 11 12 def __init__(self): 12 13 self.scrapy_settings = get_project_settings() 13 - 14 14 15 15 def set_output(self, filename, fileformat): 16 16 """ ··· 30 30 if fileformat is not None: 31 31 self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat 32 32 33 - 34 - def start_log(self, logfile, verbose): 33 + def set_logging(self, logfile=None, verbose=0): 35 34 """ 36 - This function starts the logging functionality of Scrapy using the settings given by the CLI. 35 + This function changes the default settings of Scapy's logging functionality 36 + using the settings given by the CLI. 37 37 :param logfile: The location where the logfile will be saved. 38 - :param verbose: A boolean value to switch between loglevels. 38 + :param verbose: A integer value to switch between loglevels. 39 39 """ 40 + if verbose != 0: 41 + self.scrapy_settings.overrides["LOG_ENABLED"] = True 42 + else: 43 + self.scrapy_settings.overrides["LOG_ENABLED"] = False 44 + 45 + if verbose == 1: 46 + self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING" 47 + elif verbose == 2: 48 + self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO" 49 + else: 50 + self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG" 51 + 52 + if verbose > 1: 53 + self.scrapy_settings.overrides["LOG_STDOUT"] = False 54 + else: 55 + self.scrapy_settings.overrides["LOG_STDOUT"] = True 56 + 40 57 if logfile is not None: 41 - if verbose: 42 - log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG) 43 - else: 44 - log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING) 58 + self.scrapy_settings.overrides["LOG_FILE"] = logfile 45 59 else: 46 - if verbose: 47 - log.start(logstdout=False, loglevel=log.DEBUG) 48 - else: 49 - log.start(logstdout=True, loglevel=log.WARNING) 60 + self.scrapy_settings.overrides["LOG_FILE"] = None 50 61 
51 62 @staticmethod 52 63 def read_sourceconfiguration(): ··· 56 67 :return a ConfigParser object of sources.cfg 57 68 """ 58 69 config = ConfigParser.ConfigParser() 59 - config.read('sources.cfg') # [TODO]: should be softcoded eventually 70 + config.read('sources.cfg') # [TODO]: should be softcoded eventually 60 71 return config 61 72 62 73 @staticmethod ··· 75 86 elif config.defaults(): 76 87 section = config.defaults() 77 88 if 'reliability' not in section: 78 - log.msg('Reliability not set for %s' % sourcename, 79 - level=log.WARNING) 89 + print 'WARNING: Reliability not set for %s' % sourcename 80 90 section['reliability'] = '' 81 91 return section
+1
utils/sourceloader.py
··· 5 5 from FourmiCrawler.sources.source import Source 6 6 from utils.configurator import Configurator 7 7 8 + 8 9 class SourceLoader: 9 10 sources = [] 10 11