tangled
alpha
login
or
join now
dekker.one
/
Fourmi
0
fork
atom
A web scraper built to search for specific information on a given compound (and its pseudonyms)
0
fork
atom
overview
issues
pulls
pipelines
added scraping for generic info except for synonyms
Rob tB
12 years ago
98f58ea4
50c79e3b
+37
1 changed file
expand all
collapse all
unified
split
FourmiCrawler
sources
NIST.py
+37
FourmiCrawler/sources/NIST.py
···
18
18
19
19
requests = []
20
20
21
21
+
requests.extend(self.parse_generic_info(sel))
22
22
+
21
23
symbol_table = {}
22
24
tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
23
25
for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
···
58
60
else:
59
61
log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
60
62
continue #Assume unsupported
63
63
+
return requests
64
64
+
65
65
+
def parse_generic_info(self, sel):
    """Scrape the generic-information list of a NIST WebBook compound page.

    Reads the <ul> that carries the compound summary (identified by its
    "IUPAC Standard InChI:" entry) and extracts the chemical formula,
    molecular weight, InChI, InChIKey and CAS registry number, wrapping
    each attribute as a Result item.

    :param sel: Selector for the full response page
    :return: list of Result items, one per scraped attribute
    """
    ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')

    data = {}

    # The formula <li> mixes label text with markup; the first two text
    # nodes are the label, the remainder is the formula itself.
    raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
    data['Chemical formula'] = ''.join(raw_formula[2:]).strip()

    raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
    data['Molecular weight'] = raw_mol_weight.extract()[0].strip()

    raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
    data['IUPAC Standard InChI'] = raw_inchi.extract()[0]

    raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
                            '/tt/text()')
    data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]

    raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
    data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()

    # Wrap every scraped attribute in a Result item; reliability and
    # conditions are not provided by this page section.
    requests = []
    for key, value in data.iteritems():
        result = Result({
            'attribute': key,
            'value': value,
            'source': 'NIST',
            'reliability': 'Unknown',
            'conditions': ''
        })
        requests.append(result)

    return requests
62
99
63
100
def parse_aggregate_data(self, table, symbol_table):