tangled
alpha
login
or
join now
hailey.at
/
followgraph
0
fork
atom
this repo has no description
0
fork
atom
overview
issues
pulls
pipelines
add graph builder
hailey.at
2 months ago
df6756de
d697bcb3
+108
-1
2 changed files
expand all
collapse all
unified
split
build_graph.py
indexer.py
+84
build_graph.py
···
1
1
+
from collections import UserString
2
2
+
import logging
3
3
+
from typing import Dict, Optional, Set
4
4
+
5
5
+
import click
6
6
+
7
7
+
from config import CONFIG
8
8
+
from indexer import FollowIndexer
9
9
+
import indexer
10
10
+
11
11
+
12
12
+
# Configure root logging once at import time so every module logger emits
# timestamped, level-tagged records to stderr.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

# Module-level logger named after this module (standard logging convention).
logger = logging.getLogger(__name__)
17
17
+
18
18
+
19
19
+
@click.command
@click.option(
    "--ch-host",
)
@click.option(
    "--ch-port",
    type=int,
)
@click.option(
    "--ch-user",
)
@click.option(
    "--ch-pass",
)
def main(
    ch_host: Optional[str],
    ch_port: Optional[int],
    ch_user: Optional[str],
    ch_pass: Optional[str],
):
    """Build the follow graph from ClickHouse and persist a 2-hop proximity map.

    Streams every live (did, subject) follow edge, builds an in-memory
    adjacency map, then for each user computes:

    - ``hop1``: the users they follow directly.
    - ``hop2``: users followed by their hop1 set, excluding hop1 and themself.

    The resulting map is pickled to ``prox_map.pkl`` in the working directory.

    Each ``--ch-*`` option falls back to the corresponding ``CONFIG`` value
    when not supplied on the command line.
    """
    # Stdlib, only needed by this command; kept function-local as in the
    # original, but hoisted to the top of the body so all imports are visible.
    import pickle

    logger.info("Building follow graph...")

    # Named `follow_indexer` (not `indexer`) to avoid shadowing the
    # module-level `import indexer`.
    follow_indexer = FollowIndexer(
        clickhouse_host=ch_host or CONFIG.clickhouse_host,
        clickhouse_port=ch_port or CONFIG.clickhouse_port,
        clickhouse_user=ch_user or CONFIG.clickhouse_user,
        clickhouse_pass=ch_pass or CONFIG.clickhouse_pass,
        batch_size=1000,
    )

    # Adjacency map: follower DID -> set of followed DIDs.
    graph: Dict[str, Set[str]] = {}

    def add_follow(did: str, subject: str) -> None:
        # Callback invoked once per streamed follow edge; `setdefault`
        # replaces the original membership-test-then-insert double lookup.
        graph.setdefault(did, set()).add(subject)

    follow_indexer.stream_follows(add_follow)

    prox_map = {}

    # Iterate items() directly instead of re-looking each key up with .get(),
    # which also avoids allocating a throwaway default set per user.
    for did, first in graph.items():
        # Union of everyone followed by the people `did` follows.
        second: Set[str] = set()
        for subject in first:
            second.update(graph.get(subject, set()))

        prox_map[did] = {
            "hop1": first,
            # Exclude direct follows and the user themself from hop2.
            "hop2": second - first - {did},
        }

    with open("prox_map.pkl", "wb") as f:
        pickle.dump(prox_map, f)

    logger.info(
        f"Finished building proximity map, saved to prox_map.pkl. {len(prox_map):,} users in map."
    )
81
81
+
82
82
+
83
83
+
# Script entry point: invoke the click command only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
+24
-1
indexer.py
···
4
4
from datetime import datetime
5
5
from threading import Lock
6
6
from time import time
7
7
-
from typing import Any, List, Optional
7
7
+
from typing import Any, Callable, List, Optional
8
8
9
9
import click
10
10
from aiokafka import AIOKafkaConsumer, ConsumerRecord
···
184
184
batch_to_flush = self._unfollow_batch.copy()
185
185
self._unfollow_batch = []
186
186
self._flush_unfollows(batch_to_flush)
187
187
+
188
188
+
def stream_follows(self, cb: Callable[[str, str], None], batch_size: int = 100_000):
    """Stream every live follow edge from ClickHouse to *cb*.

    Selects ``(did, subject)`` pairs from ``follows``, dropping any row whose
    URI has a matching entry in ``unfollows`` (LEFT ANTI JOIN), and invokes
    ``cb(did, subject)`` once per surviving edge. Progress is logged every
    million edges.

    Args:
        cb: Callback receiving (follower_did, subject_did) for each edge.
        batch_size: ClickHouse ``max_block_size`` setting for the
            row-block stream.

    Note:
        Errors are logged with traceback and swallowed, so on failure the
        caller may have received only a partial stream of edges.
    """
    query = """
    SELECT f.did, f.subject
    FROM follows f
    LEFT ANTI JOIN unfollows u ON f.uri = u.uri
    """

    try:
        with self.client.query_row_block_stream(
            query, settings={"max_block_size": batch_size}
        ) as stream:
            total_handled = 0
            for block in stream:
                for row in block:
                    cb(row[0], row[1])
                    total_handled += 1

                    if total_handled % 1_000_000 == 0:
                        logger.info(f"Handled {total_handled:,} follows so far")
            logger.info(f"Finished streaming {total_handled:,} follows")
    except Exception:
        # logger.exception records the full traceback, which the original
        # logger.error(f"...{e}") call discarded.
        logger.exception("Error streaming follows")
187
210
188
211
189
212
class Consumer: