tangled
alpha
login
or
join now
byarielm.fyi
/
pasturepy
1
fork
atom
pasturepy
is a Python tool for generating JSON feed definitions for use with
Graze
. Use it to programmatically create and customize feeds for Graze.
1
fork
atom
overview
issues
pulls
pipelines
feat: regex node from line-delimited files
byarielm.fyi
1 month ago
d7c77551
72c5ae58
+64
1 changed file
expand all
collapse all
unified
split
pasturepy
nodes
text.py
+64
pasturepy/nodes/text.py
···
1
1
+
from pathlib import Path
2
2
+
1
3
from pasturepy.constants.fields import OPTION_FIELDS, TEXT_FIELDS
2
4
from pasturepy.constants.graze_json import REGEX_METHODS, WORD_METHODS
3
5
···
41
43
TextNode._validate_field(field)
42
44
43
45
return filter_group.add_filter({method: [field, term, ignore_case]})
46
46
+
47
47
+
@staticmethod
def regex_from_files(
    filter_group,
    method: str,
    field: str,
    file_paths: str | Path | list[str | Path],
    optimize: bool = False,
    ignore_case: bool = True,
):
    """Add a regex filter whose pattern is built from line-delimited term files.

    Every non-blank line of every file is treated as one regex term; the
    terms are combined into a single pattern and registered on
    ``filter_group`` as ``{method: [field, pattern, ignore_case]}``.

    Args:
        filter_group: Object exposing ``add_filter``; receives the filter.
        method: Regex method name; must be a member of ``REGEX_METHODS``.
        field: Field to match against; checked by ``TextNode._validate_field``.
        file_paths: A single path (``str`` or ``Path``) or a list of paths.
        optimize: Combine terms with ``_optimal_join`` instead of
            ``_simple_join``.
        ignore_case: Forwarded unchanged as the last element of the filter.

    Returns:
        Whatever ``filter_group.add_filter`` returns.

    Raises:
        ValueError: If ``method`` is not in ``REGEX_METHODS`` or the files
            contain no terms.
        FileNotFoundError: If one of the term files does not exist.
    """
    if method not in REGEX_METHODS:
        raise ValueError(
            f"Invalid method '{method}'. Must be one of {REGEX_METHODS}"
        )
    TextNode._validate_field(field)

    # A lone path (str or Path) is wrapped in a list; without the Path check
    # a single Path object would be iterated below and raise TypeError.
    paths = [file_paths] if isinstance(file_paths, (str, Path)) else file_paths

    all_terms = [
        term for term_file in paths for term in _read_term_file(term_file)
    ]
    if not all_terms:
        raise ValueError("No patterns found in term files")

    join_terms = _optimal_join if optimize else _simple_join
    final_regex = join_terms(all_terms)

    return filter_group.add_filter({method: [field, final_regex, ignore_case]})
78
78
+
79
79
+
80
80
+
def _read_term_file(file_path: str) -> list[str]:
    """Return the non-blank lines of a term file, stripped of whitespace.

    Args:
        file_path: Path of a UTF-8 text file holding one term per line.

    Returns:
        The stripped, non-empty lines in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"{file_path} not found")

    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
    # byte-order mark. str.strip() does not remove U+FEFF, so with plain
    # "utf-8" the first term of a BOM-prefixed file would carry an invisible
    # character and never match.
    with path.open("r", encoding="utf-8-sig") as f:
        return [term for line in f if (term := line.strip())]
95
95
+
96
96
+
97
97
+
def _simple_join(terms: list[str]) -> str:
    """Combine regex terms into a single pattern.

    A lone term is returned untouched; any other number of terms is joined
    with ``|`` and wrapped in a non-capturing group.
    """
    if len(terms) != 1:
        alternation = "|".join(terms)
        return "(?:" + alternation + ")"
    return terms[0]
101
101
+
102
102
+
103
103
+
def _optimal_join(terms: list[str]) -> str:
    """Combine regex terms into a single pattern, dropping repeated terms.

    Duplicate terms are removed while keeping first-occurrence order. An
    alternative that repeats an earlier one can never match anything new,
    so the set of matched strings stays the same and the pattern gets
    shorter. A lone remaining term is returned untouched; otherwise the
    terms are joined with ``|`` inside a non-capturing group.
    """
    # TODO: further optimization (e.g. factoring out shared prefixes)
    # dict.fromkeys keeps insertion order, giving an order-preserving dedupe.
    unique_terms = list(dict.fromkeys(terms))
    if len(unique_terms) == 1:
        return unique_terms[0]
    return f"(?:{'|'.join(unique_terms)})"