Git fork
1#ifndef BLOOM_H
2#define BLOOM_H
3
4struct commit;
5struct repository;
6struct commit_graph;
7
8struct bloom_filter_settings {
9 /*
10 * The version of the hashing technique being used.
11 * The newest version is 2, which is
12 * the seeded murmur3 hashing technique implemented
13 * in bloom.c. Bloom filters of version 1 were created
14 * with prior versions of Git, which had a bug in the
15 * implementation of the hash function.
16 */
17 uint32_t hash_version;
18
19 /*
20 * The number of times a path is hashed, i.e. the
21 * number of bit positions that cumulatively
22 * determine whether a path is present in the
23 * Bloom filter.
24 */
25 uint32_t num_hashes;
26
27 /*
28 * The minimum number of bits per entry in the Bloom
29 * filter. If the filter contains 'n' entries, then
30 * filter size is the minimum number of 8-bit words
31 * that contain n*b bits.
32 */
33 uint32_t bits_per_entry;
34
35 /*
36 * The maximum number of changed paths per commit
37 * before declaring a Bloom filter to be too-large.
38 *
39 * Not written to the commit-graph file.
40 */
41 uint32_t max_changed_paths;
42};
43
44#define DEFAULT_BLOOM_MAX_CHANGES 512
45#define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10, DEFAULT_BLOOM_MAX_CHANGES }
46#define BITS_PER_WORD 8
47#define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t)
48
49/*
50 * A bloom_filter struct represents a data segment to
51 * use when testing hash values. The 'len' member
52 * dictates how many entries are stored in
53 * 'data'.
54 */
55struct bloom_filter {
56 unsigned char *data;
57 size_t len;
58 int version;
59
60 void *to_free;
61};
62
63/*
64 * A bloom_key represents the k hash values for a
65 * given string. These can be precomputed and
66 * stored in a bloom_key for re-use when testing
67 * against a bloom_filter. The number of hashes is
68 * given by the Bloom filter settings and is the same
69 * for all Bloom filters and keys interacting with
70 * the loaded version of the commit graph file and
71 * the Bloom data chunks.
72 */
73struct bloom_key {
74 uint32_t *hashes;
75};
76
77/*
78 * A bloom_keyvec is a vector of bloom_keys, which
79 * can be used to store multiple keys for a single
80 * pathspec item.
81 */
82struct bloom_keyvec {
83 size_t count;
84 struct bloom_key key[FLEX_ARRAY];
85};
86
87int load_bloom_filter_from_graph(struct commit_graph *g,
88 struct bloom_filter *filter,
89 uint32_t graph_pos);
90
91void bloom_key_fill(struct bloom_key *key, const char *data, size_t len,
92 const struct bloom_filter_settings *settings);
93void bloom_key_clear(struct bloom_key *key);
94
95/*
96 * bloom_keyvec_new - Allocate and populate a bloom_keyvec with keys for the
97 * given path.
98 *
99 * This function splits the input path by '/' and generates a bloom key for each
100 * prefix, in reverse order of specificity. For example, given the input
101 * "a/b/c", it will generate bloom keys for:
102 * - "a/b/c"
103 * - "a/b"
104 * - "a"
105 *
106 * The resulting keys are stored in a newly allocated bloom_keyvec.
107 */
108struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len,
109 const struct bloom_filter_settings *settings);
110void bloom_keyvec_free(struct bloom_keyvec *vec);
111
112void add_key_to_filter(const struct bloom_key *key,
113 struct bloom_filter *filter,
114 const struct bloom_filter_settings *settings);
115
116void init_bloom_filters(void);
117void deinit_bloom_filters(void);
118
119enum bloom_filter_computed {
120 BLOOM_NOT_COMPUTED = (1 << 0),
121 BLOOM_COMPUTED = (1 << 1),
122 BLOOM_TRUNC_LARGE = (1 << 2),
123 BLOOM_TRUNC_EMPTY = (1 << 3),
124 BLOOM_UPGRADED = (1 << 4),
125};
126
127struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
128 struct commit *c,
129 int compute_if_not_present,
130 const struct bloom_filter_settings *settings,
131 enum bloom_filter_computed *computed);
132
133/*
134 * Find the Bloom filter associated with the given commit "c".
135 *
136 * If any of the following are true
137 *
138 * - the repository does not have a commit-graph, or
139 * - the repository disables reading from the commit-graph, or
140 * - the given commit does not have a Bloom filter computed, or
141 * - there is a Bloom filter for commit "c", but it cannot be read
142 * because the filter uses an incompatible version of murmur3
143 *
144 * , then `get_bloom_filter()` will return NULL. Otherwise, the corresponding
145 * Bloom filter will be returned.
146 *
147 * For callers who wish to inspect Bloom filters with incompatible hash
148 * versions, use get_or_compute_bloom_filter().
149 */
150struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c);
151
152int bloom_filter_contains(const struct bloom_filter *filter,
153 const struct bloom_key *key,
154 const struct bloom_filter_settings *settings);
155
156/*
157 * bloom_filter_contains_vec - Check if all keys in a key vector are in the
158 * Bloom filter.
159 *
160 * Returns 1 if **all** keys in the vector are present in the filter,
161 * 0 if **any** key is not present.
162 */
163int bloom_filter_contains_vec(const struct bloom_filter *filter,
164 const struct bloom_keyvec *v,
165 const struct bloom_filter_settings *settings);
166
167uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len,
168 int version);
169
170#endif