Git fork

bloom: replace struct bloom_key * with struct bloom_keyvec

Previously, we stored bloom keys in a flat array and marked a commit
as TREESAME if any key reported "definitely not changed".

To support multiple pathspec items, we now require that for each
pathspec item, there exists a bloom key reporting "definitely not
changed".

This "for every" condition makes a flat array insufficient, so we
introduce a new structure to group keys by a single pathspec item.
`struct bloom_keyvec` is introduced to replace `struct bloom_key *`
and `bloom_keys_nr`. And because we want to support multiple pathspec
items, we added a `bloom_keyvecs` array and a `bloom_keyvecs_nr` field
to `struct rev_info` to represent an array of bloom_keyvecs. This
commit still optimizes only one pathspec item, thus bloom_keyvecs_nr
can only be 0 or 1.

New bloom_keyvec_* functions are added to create and destroy a keyvec.
bloom_filter_contains_vec() is added to check whether all keys in a
keyvec are contained in a bloom filter.

Signed-off-by: Lidong Yan <502024330056@smail.nju.edu.cn>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

authored by

Lidong Yan and committed by
Junio C Hamano
90d5518a b187353e

+132 -49
+61
bloom.c
··· 278 278 deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter); 279 279 } 280 280 281 + struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len, 282 + const struct bloom_filter_settings *settings) 283 + { 284 + struct bloom_keyvec *vec; 285 + const char *p; 286 + size_t sz; 287 + size_t nr = 1; 288 + 289 + p = path; 290 + while (*p) { 291 + /* 292 + * At this point, the path is normalized to use Unix-style 293 + * path separators. This is required due to how the 294 + * changed-path Bloom filters store the paths. 295 + */ 296 + if (*p == '/') 297 + nr++; 298 + p++; 299 + } 300 + 301 + sz = sizeof(struct bloom_keyvec); 302 + sz += nr * sizeof(struct bloom_key); 303 + vec = (struct bloom_keyvec *)xcalloc(1, sz); 304 + if (!vec) 305 + return NULL; 306 + vec->count = nr; 307 + 308 + bloom_key_fill(&vec->key[0], path, len, settings); 309 + nr = 1; 310 + p = path + len - 1; 311 + while (p > path) { 312 + if (*p == '/') { 313 + bloom_key_fill(&vec->key[nr++], path, p - path, settings); 314 + } 315 + p--; 316 + } 317 + assert(nr == vec->count); 318 + return vec; 319 + } 320 + 321 + void bloom_keyvec_free(struct bloom_keyvec *vec) 322 + { 323 + if (!vec) 324 + return; 325 + for (size_t nr = 0; nr < vec->count; nr++) 326 + bloom_key_clear(&vec->key[nr]); 327 + free(vec); 328 + } 329 + 281 330 static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED, 282 331 const struct hashmap_entry *eptr, 283 332 const struct hashmap_entry *entry_or_key, ··· 537 586 } 538 587 539 588 return 1; 589 + } 590 + 591 + int bloom_filter_contains_vec(const struct bloom_filter *filter, 592 + const struct bloom_keyvec *vec, 593 + const struct bloom_filter_settings *settings) 594 + { 595 + int ret = 1; 596 + 597 + for (size_t nr = 0; ret > 0 && nr < vec->count; nr++) 598 + ret = bloom_filter_contains(filter, &vec->key[nr], settings); 599 + 600 + return ret; 540 601 } 541 602 542 603 uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len,
+38
bloom.h
··· 74 74 uint32_t *hashes; 75 75 }; 76 76 77 + /* 78 + * A bloom_keyvec is a vector of bloom_keys, which 79 + * can be used to store multiple keys for a single 80 + * pathspec item. 81 + */ 82 + struct bloom_keyvec { 83 + size_t count; 84 + struct bloom_key key[FLEX_ARRAY]; 85 + }; 86 + 77 87 int load_bloom_filter_from_graph(struct commit_graph *g, 78 88 struct bloom_filter *filter, 79 89 uint32_t graph_pos); ··· 82 92 const struct bloom_filter_settings *settings); 83 93 void bloom_key_clear(struct bloom_key *key); 84 94 95 + /* 96 + * bloom_keyvec_new - Allocate and populate a bloom_keyvec with keys for the 97 + * given path. 98 + * 99 + * This function splits the input path by '/' and generates a bloom key for each 100 + * prefix, in reverse order of specificity. For example, given the input 101 + * "a/b/c", it will generate bloom keys for: 102 + * - "a/b/c" 103 + * - "a/b" 104 + * - "a" 105 + * 106 + * The resulting keys are stored in a newly allocated bloom_keyvec. 107 + */ 108 + struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len, 109 + const struct bloom_filter_settings *settings); 110 + void bloom_keyvec_free(struct bloom_keyvec *vec); 111 + 85 112 void add_key_to_filter(const struct bloom_key *key, 86 113 struct bloom_filter *filter, 87 114 const struct bloom_filter_settings *settings); ··· 125 152 int bloom_filter_contains(const struct bloom_filter *filter, 126 153 const struct bloom_key *key, 127 154 const struct bloom_filter_settings *settings); 155 + 156 + /* 157 + * bloom_filter_contains_vec - Check if all keys in a key vector are in the 158 + * Bloom filter. 159 + * 160 + * Returns 1 if **all** keys in the vector are present in the filter, 161 + * 0 if **any** key is not present. 
162 + */ 163 + int bloom_filter_contains_vec(const struct bloom_filter *filter, 164 + const struct bloom_keyvec *v, 165 + const struct bloom_filter_settings *settings); 128 166 129 167 uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len, 130 168 int version);
+30 -46
revision.c
··· 685 685 return 0; 686 686 } 687 687 688 + static void release_revisions_bloom_keyvecs(struct rev_info *revs); 689 + 688 690 static void prepare_to_use_bloom_filter(struct rev_info *revs) 689 691 { 690 692 struct pathspec_item *pi; 691 693 char *path_alloc = NULL; 692 - const char *path, *p; 694 + const char *path; 693 695 size_t len; 694 - int path_component_nr = 1; 695 696 696 697 if (!revs->commits) 697 698 return; ··· 708 709 if (!revs->pruning.pathspec.nr) 709 710 return; 710 711 712 + revs->bloom_keyvecs_nr = 1; 713 + CALLOC_ARRAY(revs->bloom_keyvecs, 1); 711 714 pi = &revs->pruning.pathspec.items[0]; 712 715 713 716 /* remove single trailing slash from path, if needed */ ··· 718 721 path = pi->match; 719 722 720 723 len = strlen(path); 721 - if (!len) { 722 - revs->bloom_filter_settings = NULL; 723 - free(path_alloc); 724 - return; 725 - } 726 - 727 - p = path; 728 - while (*p) { 729 - /* 730 - * At this point, the path is normalized to use Unix-style 731 - * path separators. This is required due to how the 732 - * changed-path Bloom filters store the paths. 
733 - */ 734 - if (*p == '/') 735 - path_component_nr++; 736 - p++; 737 - } 738 - 739 - revs->bloom_keys_nr = path_component_nr; 740 - ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr); 741 - 742 - bloom_key_fill(&revs->bloom_keys[0], path, len, 743 - revs->bloom_filter_settings); 744 - path_component_nr = 1; 724 + if (!len) 725 + goto fail; 745 726 746 - p = path + len - 1; 747 - while (p > path) { 748 - if (*p == '/') 749 - bloom_key_fill(&revs->bloom_keys[path_component_nr++], 750 - path, p - path, 751 - revs->bloom_filter_settings); 752 - p--; 753 - } 727 + revs->bloom_keyvecs[0] = 728 + bloom_keyvec_new(path, len, revs->bloom_filter_settings); 754 729 755 730 if (trace2_is_enabled() && !bloom_filter_atexit_registered) { 756 731 atexit(trace2_bloom_filter_statistics_atexit); 757 732 bloom_filter_atexit_registered = 1; 758 733 } 759 734 735 + return; 736 + 737 + fail: 738 + revs->bloom_filter_settings = NULL; 760 739 free(path_alloc); 740 + release_revisions_bloom_keyvecs(revs); 761 741 } 762 742 763 743 static int check_maybe_different_in_bloom_filter(struct rev_info *revs, 764 744 struct commit *commit) 765 745 { 766 746 struct bloom_filter *filter; 767 - int result = 1, j; 747 + int result = 0; 768 748 769 749 if (!revs->repo->objects->commit_graph) 770 750 return -1; ··· 779 759 return -1; 780 760 } 781 761 782 - for (j = 0; result && j < revs->bloom_keys_nr; j++) { 783 - result = bloom_filter_contains(filter, 784 - &revs->bloom_keys[j], 785 - revs->bloom_filter_settings); 762 + for (size_t nr = 0; !result && nr < revs->bloom_keyvecs_nr; nr++) { 763 + result = bloom_filter_contains_vec(filter, 764 + revs->bloom_keyvecs[nr], 765 + revs->bloom_filter_settings); 786 766 } 787 767 788 768 if (result) ··· 823 803 return REV_TREE_SAME; 824 804 } 825 805 826 - if (revs->bloom_keys_nr && !nth_parent) { 806 + if (revs->bloom_keyvecs_nr && !nth_parent) { 827 807 bloom_ret = check_maybe_different_in_bloom_filter(revs, commit); 828 808 829 809 if (bloom_ret == 0) ··· 
850 830 if (!t1) 851 831 return 0; 852 832 853 - if (!nth_parent && revs->bloom_keys_nr) { 833 + if (!nth_parent && revs->bloom_keyvecs_nr) { 854 834 bloom_ret = check_maybe_different_in_bloom_filter(revs, commit); 855 835 if (!bloom_ret) 856 836 return 1; ··· 3200 3180 3201 3181 static void release_revisions_topo_walk_info(struct topo_walk_info *info); 3202 3182 3183 + static void release_revisions_bloom_keyvecs(struct rev_info *revs) 3184 + { 3185 + for (size_t nr = 0; nr < revs->bloom_keyvecs_nr; nr++) 3186 + bloom_keyvec_free(revs->bloom_keyvecs[nr]); 3187 + FREE_AND_NULL(revs->bloom_keyvecs); 3188 + revs->bloom_keyvecs_nr = 0; 3189 + } 3190 + 3203 3191 static void free_void_commit_list(void *list) 3204 3192 { 3205 3193 free_commit_list(list); ··· 3228 3216 clear_decoration(&revs->treesame, free); 3229 3217 line_log_free(revs); 3230 3218 oidset_clear(&revs->missing_commits); 3231 - 3232 - for (int i = 0; i < revs->bloom_keys_nr; i++) 3233 - bloom_key_clear(&revs->bloom_keys[i]); 3234 - FREE_AND_NULL(revs->bloom_keys); 3235 - revs->bloom_keys_nr = 0; 3219 + release_revisions_bloom_keyvecs(revs); 3236 3220 } 3237 3221 3238 3222 static void add_child(struct rev_info *revs, struct commit *parent, struct commit *child)
+3 -3
revision.h
··· 62 62 struct rev_info; 63 63 struct string_list; 64 64 struct saved_parents; 65 - struct bloom_key; 65 + struct bloom_keyvec; 66 66 struct bloom_filter_settings; 67 67 struct option; 68 68 struct parse_opt_ctx_t; ··· 360 360 361 361 /* Commit graph bloom filter fields */ 362 362 /* The bloom filter key(s) for the pathspec */ 363 - struct bloom_key *bloom_keys; 364 - int bloom_keys_nr; 363 + struct bloom_keyvec **bloom_keyvecs; 364 + int bloom_keyvecs_nr; 365 365 366 366 /* 367 367 * The bloom filter settings used to generate the key.