Git fork

list-objects-filter: implement composite filters

Allow combining filters such that only objects accepted by all filters
are shown. The motivation for this is to allow getting directory
listings without also fetching blobs. This can be done by combining
blob:none with tree:<depth>. Limiting the tree depth matters because
there are massive repositories that have larger-than-expected trees,
even if you include only a single commit.

A combined filter supports any number of subfilters, and is written in
the following form:

combine:<filter 1>+<filter 2>+<filter 3>

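Certain non-alphanumeric characters in each filter must be
URL-encoded. For example, a literal "+" inside a sub-filter must be
written as "%2B" so that it is not mistaken for the separator between
sub-filters.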

For now, combined filters must be specified in this form. In a
subsequent commit, rev-list will support multiple --filter arguments
which will have the same effect as specifying one filter argument
starting with "combine:". The documentation will be updated in that
commit, as the URL-encoding scheme is in general not meant to be used
directly by the user, and it is better to describe the URL-encoding
feature in terms of the repeated flag.

Helped-by: Emily Shaffer <emilyshaffer@google.com>
Helped-by: Jeff Hostetler <git@jeffhostetler.com>
Helped-by: Johannes Schindelin <Johannes.Schindelin@gmx.de>
Helped-by: Jonathan Tan <jonathantanmy@google.com>
Helped-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Matthew DeVore <matvore@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

Authored by Matthew DeVore and committed by Junio C Hamano.
e987df5f 842b0051

+454 -8
+104 -2
list-objects-filter-options.c
··· 6 6 #include "list-objects.h" 7 7 #include "list-objects-filter.h" 8 8 #include "list-objects-filter-options.h" 9 + #include "url.h" 10 + 11 + static int parse_combine_filter( 12 + struct list_objects_filter_options *filter_options, 13 + const char *arg, 14 + struct strbuf *errbuf); 9 15 10 16 /* 11 17 * Parse value of the argument to the "filter" keyword. ··· 35 41 return 1; 36 42 } 37 43 38 - filter_options->filter_spec = strdup(arg); 39 - 40 44 if (!strcmp(arg, "blob:none")) { 41 45 filter_options->choice = LOFC_BLOB_NONE; 42 46 return 0; ··· 77 81 _("sparse:path filters support has been dropped")); 78 82 } 79 83 return 1; 84 + 85 + } else if (skip_prefix(arg, "combine:", &v0)) { 86 + return parse_combine_filter(filter_options, v0, errbuf); 87 + 80 88 } 81 89 /* 82 90 * Please update _git_fetch() in git-completion.bash when you ··· 89 97 return 1; 90 98 } 91 99 100 + static const char *RESERVED_NON_WS = "~`!@#$^&*()[]{}\\;'\",<>?"; 101 + 102 + static int has_reserved_character( 103 + struct strbuf *sub_spec, struct strbuf *errbuf) 104 + { 105 + const char *c = sub_spec->buf; 106 + while (*c) { 107 + if (*c <= ' ' || strchr(RESERVED_NON_WS, *c)) { 108 + strbuf_addf( 109 + errbuf, 110 + _("must escape char in sub-filter-spec: '%c'"), 111 + *c); 112 + return 1; 113 + } 114 + c++; 115 + } 116 + 117 + return 0; 118 + } 119 + 120 + static int parse_combine_subfilter( 121 + struct list_objects_filter_options *filter_options, 122 + struct strbuf *subspec, 123 + struct strbuf *errbuf) 124 + { 125 + size_t new_index = filter_options->sub_nr++; 126 + char *decoded; 127 + int result; 128 + 129 + ALLOC_GROW(filter_options->sub, filter_options->sub_nr, 130 + filter_options->sub_alloc); 131 + memset(&filter_options->sub[new_index], 0, 132 + sizeof(*filter_options->sub)); 133 + 134 + decoded = url_percent_decode(subspec->buf); 135 + 136 + result = has_reserved_character(subspec, errbuf) || 137 + gently_parse_list_objects_filter( 138 + &filter_options->sub[new_index], 
decoded, errbuf); 139 + 140 + free(decoded); 141 + return result; 142 + } 143 + 144 + static int parse_combine_filter( 145 + struct list_objects_filter_options *filter_options, 146 + const char *arg, 147 + struct strbuf *errbuf) 148 + { 149 + struct strbuf **subspecs = strbuf_split_str(arg, '+', 0); 150 + size_t sub; 151 + int result = 0; 152 + 153 + if (!subspecs[0]) { 154 + strbuf_addstr(errbuf, _("expected something after combine:")); 155 + result = 1; 156 + goto cleanup; 157 + } 158 + 159 + for (sub = 0; subspecs[sub] && !result; sub++) { 160 + if (subspecs[sub + 1]) { 161 + /* 162 + * This is not the last subspec. Remove trailing "+" so 163 + * we can parse it. 164 + */ 165 + size_t last = subspecs[sub]->len - 1; 166 + assert(subspecs[sub]->buf[last] == '+'); 167 + strbuf_remove(subspecs[sub], last, 1); 168 + } 169 + result = parse_combine_subfilter( 170 + filter_options, subspecs[sub], errbuf); 171 + } 172 + 173 + filter_options->choice = LOFC_COMBINE; 174 + 175 + cleanup: 176 + strbuf_list_free(subspecs); 177 + if (result) { 178 + list_objects_filter_release(filter_options); 179 + memset(filter_options, 0, sizeof(*filter_options)); 180 + } 181 + return result; 182 + } 183 + 92 184 int parse_list_objects_filter(struct list_objects_filter_options *filter_options, 93 185 const char *arg) 94 186 { 95 187 struct strbuf buf = STRBUF_INIT; 188 + filter_options->filter_spec = strdup(arg); 96 189 if (gently_parse_list_objects_filter(filter_options, arg, &buf)) 97 190 die("%s", buf.buf); 98 191 return 0; ··· 129 222 void list_objects_filter_release( 130 223 struct list_objects_filter_options *filter_options) 131 224 { 225 + size_t sub; 226 + 227 + if (!filter_options) 228 + return; 132 229 free(filter_options->filter_spec); 133 230 free(filter_options->sparse_oid_value); 231 + for (sub = 0; sub < filter_options->sub_nr; sub++) 232 + list_objects_filter_release(&filter_options->sub[sub]); 233 + free(filter_options->sub); 134 234 memset(filter_options, 0, 
sizeof(*filter_options)); 135 235 } 136 236 ··· 174 274 */ 175 275 if (!core_partial_clone_filter_default) 176 276 return; 277 + 278 + filter_options->filter_spec = strdup(core_partial_clone_filter_default); 177 279 gently_parse_list_objects_filter(filter_options, 178 280 core_partial_clone_filter_default, 179 281 &errbuf);
+14 -3
list-objects-filter-options.h
··· 13 13 LOFC_BLOB_LIMIT, 14 14 LOFC_TREE_DEPTH, 15 15 LOFC_SPARSE_OID, 16 + LOFC_COMBINE, 16 17 LOFC__COUNT /* must be last */ 17 18 }; 18 19 ··· 38 39 unsigned int no_filter : 1; 39 40 40 41 /* 41 - * Parsed values (fields) from within the filter-spec. These are 42 - * choice-specific; not all values will be defined for any given 43 - * choice. 42 + * BEGIN choice-specific parsed values from within the filter-spec. Only 43 + * some values will be defined for any given choice. 44 44 */ 45 + 45 46 struct object_id *sparse_oid_value; 46 47 unsigned long blob_limit_value; 47 48 unsigned long tree_exclude_depth; 49 + 50 + /* LOFC_COMBINE values */ 51 + 52 + /* This array contains all the subfilters which this filter combines. */ 53 + size_t sub_nr, sub_alloc; 54 + struct list_objects_filter_options *sub; 55 + 56 + /* 57 + * END choice-specific parsed values. 58 + */ 48 59 }; 49 60 50 61 /* Normalized command line arguments */
+161
list-objects-filter.c
··· 26 26 */ 27 27 #define FILTER_SHOWN_BUT_REVISIT (1<<21) 28 28 29 + struct subfilter { 30 + struct filter *filter; 31 + struct oidset seen; 32 + struct oidset omits; 33 + struct object_id skip_tree; 34 + unsigned is_skipping_tree : 1; 35 + }; 36 + 29 37 struct filter { 30 38 enum list_objects_filter_result (*filter_object_fn)( 31 39 struct repository *r, ··· 35 43 const char *filename, 36 44 struct oidset *omits, 37 45 void *filter_data); 46 + 47 + /* 48 + * Optional. If this function is supplied and the filter needs 49 + * to collect omits, then this function is called once before 50 + * free_fn is called. 51 + * 52 + * This is required because the following two conditions hold: 53 + * 54 + * a. A tree filter can add and remove objects as an object 55 + * graph is traversed. 56 + * b. A combine filter's omit set is the union of all its 57 + * subfilters, which may include tree: filters. 58 + * 59 + * As such, the omits sets must be separate sets, and can only 60 + * be unioned after the traversal is completed. 61 + */ 62 + void (*finalize_omits_fn)(struct oidset *omits, void *filter_data); 38 63 39 64 void (*free_fn)(void *filter_data); 40 65 ··· 471 496 filter->free_fn = filter_sparse_free; 472 497 } 473 498 499 + /* A filter which only shows objects shown by all sub-filters. */ 500 + struct combine_filter_data { 501 + struct subfilter *sub; 502 + size_t nr; 503 + }; 504 + 505 + static enum list_objects_filter_result process_subfilter( 506 + struct repository *r, 507 + enum list_objects_filter_situation filter_situation, 508 + struct object *obj, 509 + const char *pathname, 510 + const char *filename, 511 + struct subfilter *sub) 512 + { 513 + enum list_objects_filter_result result; 514 + 515 + /* 516 + * Check and update is_skipping_tree before oidset_contains so 517 + * that is_skipping_tree gets unset even when the object is 518 + * marked as seen. 
As of this writing, no filter uses 519 + * LOFR_MARK_SEEN on trees that also uses LOFR_SKIP_TREE, so the 520 + * ordering is only theoretically important. Be cautious if you 521 + * change the order of the below checks and more filters have 522 + * been added! 523 + */ 524 + if (sub->is_skipping_tree) { 525 + if (filter_situation == LOFS_END_TREE && 526 + oideq(&obj->oid, &sub->skip_tree)) 527 + sub->is_skipping_tree = 0; 528 + else 529 + return LOFR_ZERO; 530 + } 531 + if (oidset_contains(&sub->seen, &obj->oid)) 532 + return LOFR_ZERO; 533 + 534 + result = list_objects_filter__filter_object( 535 + r, filter_situation, obj, pathname, filename, sub->filter); 536 + 537 + if (result & LOFR_MARK_SEEN) 538 + oidset_insert(&sub->seen, &obj->oid); 539 + 540 + if (result & LOFR_SKIP_TREE) { 541 + sub->is_skipping_tree = 1; 542 + sub->skip_tree = obj->oid; 543 + } 544 + 545 + return result; 546 + } 547 + 548 + static enum list_objects_filter_result filter_combine( 549 + struct repository *r, 550 + enum list_objects_filter_situation filter_situation, 551 + struct object *obj, 552 + const char *pathname, 553 + const char *filename, 554 + struct oidset *omits, 555 + void *filter_data) 556 + { 557 + struct combine_filter_data *d = filter_data; 558 + enum list_objects_filter_result combined_result = 559 + LOFR_DO_SHOW | LOFR_MARK_SEEN | LOFR_SKIP_TREE; 560 + size_t sub; 561 + 562 + for (sub = 0; sub < d->nr; sub++) { 563 + enum list_objects_filter_result sub_result = process_subfilter( 564 + r, filter_situation, obj, pathname, filename, 565 + &d->sub[sub]); 566 + if (!(sub_result & LOFR_DO_SHOW)) 567 + combined_result &= ~LOFR_DO_SHOW; 568 + if (!(sub_result & LOFR_MARK_SEEN)) 569 + combined_result &= ~LOFR_MARK_SEEN; 570 + if (!d->sub[sub].is_skipping_tree) 571 + combined_result &= ~LOFR_SKIP_TREE; 572 + } 573 + 574 + return combined_result; 575 + } 576 + 577 + static void filter_combine__free(void *filter_data) 578 + { 579 + struct combine_filter_data *d = filter_data; 580 + 
size_t sub; 581 + for (sub = 0; sub < d->nr; sub++) { 582 + list_objects_filter__free(d->sub[sub].filter); 583 + oidset_clear(&d->sub[sub].seen); 584 + if (d->sub[sub].omits.set.size) 585 + BUG("expected oidset to be cleared already"); 586 + } 587 + free(d->sub); 588 + } 589 + 590 + static void add_all(struct oidset *dest, struct oidset *src) { 591 + struct oidset_iter iter; 592 + struct object_id *src_oid; 593 + 594 + oidset_iter_init(src, &iter); 595 + while ((src_oid = oidset_iter_next(&iter)) != NULL) 596 + oidset_insert(dest, src_oid); 597 + } 598 + 599 + static void filter_combine__finalize_omits( 600 + struct oidset *omits, 601 + void *filter_data) 602 + { 603 + struct combine_filter_data *d = filter_data; 604 + size_t sub; 605 + 606 + for (sub = 0; sub < d->nr; sub++) { 607 + add_all(omits, &d->sub[sub].omits); 608 + oidset_clear(&d->sub[sub].omits); 609 + } 610 + } 611 + 612 + static void filter_combine__init( 613 + struct list_objects_filter_options *filter_options, 614 + struct filter* filter) 615 + { 616 + struct combine_filter_data *d = xcalloc(1, sizeof(*d)); 617 + size_t sub; 618 + 619 + d->nr = filter_options->sub_nr; 620 + d->sub = xcalloc(d->nr, sizeof(*d->sub)); 621 + for (sub = 0; sub < d->nr; sub++) 622 + d->sub[sub].filter = list_objects_filter__init( 623 + filter->omits ? 
&d->sub[sub].omits : NULL, 624 + &filter_options->sub[sub]); 625 + 626 + filter->filter_data = d; 627 + filter->filter_object_fn = filter_combine; 628 + filter->free_fn = filter_combine__free; 629 + filter->finalize_omits_fn = filter_combine__finalize_omits; 630 + } 631 + 474 632 typedef void (*filter_init_fn)( 475 633 struct list_objects_filter_options *filter_options, 476 634 struct filter *filter); ··· 484 642 filter_blobs_limit__init, 485 643 filter_trees_depth__init, 486 644 filter_sparse_oid__init, 645 + filter_combine__init, 487 646 }; 488 647 489 648 struct filter *list_objects_filter__init( ··· 536 695 { 537 696 if (!filter) 538 697 return; 698 + if (filter->finalize_omits_fn && filter->omits) 699 + filter->finalize_omits_fn(filter->omits, filter->filter_data); 539 700 filter->free_fn(filter->filter_data); 540 701 free(filter); 541 702 }
+11 -2
list-objects-filter.h
··· 62 62 63 63 struct filter; 64 64 65 - /* Constructor for the set of defined list-objects filters. */ 65 + /* 66 + * Constructor for the set of defined list-objects filters. 67 + * The `omitted` set is optional. It is populated with objects that the 68 + * filter excludes. This set should not be considered finalized until 69 + * after list_objects_filter__free is called on the returned `struct 70 + * filter *`. 71 + */ 66 72 struct filter *list_objects_filter__init( 67 73 struct oidset *omitted, 68 74 struct list_objects_filter_options *filter_options); ··· 80 86 const char *filename, 81 87 struct filter *filter); 82 88 83 - /* Destroys `filter`. Does nothing if `filter` is null. */ 89 + /* 90 + * Destroys `filter` and finalizes the `omitted` set, if present. Does 91 + * nothing if `filter` is null. 92 + */ 84 93 void list_objects_filter__free(struct filter *filter); 85 94 86 95 #endif /* LIST_OBJECTS_FILTER_H */
+150 -1
t/t6112-rev-list-filters-objects.sh
··· 278 278 test_line_count = 2 actual && 279 279 280 280 # Make sure no other trees were considered besides the root. 281 - ! grep "Skipping contents of tree [^.]" filter_trace 281 + ! grep "Skipping contents of tree [^.]" filter_trace && 282 + 283 + # Try this again with "combine:". If both sub-filters are skipping 284 + # trees, the composite filter should also skip trees. This is not 285 + # important unless the user does combine:tree:X+tree:Y or another filter 286 + # besides "tree:" is implemented in the future which can skip trees. 287 + GIT_TRACE=1 git -C r3 rev-list \ 288 + --objects --filter=combine:tree:1+tree:3 HEAD 2>filter_trace && 289 + 290 + # Only skip the dir1/ tree, which is shared between the two commits. 291 + grep "Skipping contents of tree " filter_trace >actual && 292 + test_write_lines "Skipping contents of tree dir1/..." >expected && 293 + test_cmp expected actual 282 294 ' 283 295 284 296 # Test tree:# filters. ··· 330 342 test_line_count = 10 actual 331 343 ' 332 344 345 + test_expect_success 'combine:... for a simple combination' ' 346 + git -C r3 rev-list --objects --filter=combine:tree:2+blob:none HEAD \ 347 + >actual && 348 + 349 + expect_has HEAD "" && 350 + expect_has HEAD~1 "" && 351 + expect_has HEAD dir1 && 352 + 353 + # There are also 2 commit objects 354 + test_line_count = 5 actual 355 + ' 356 + 357 + test_expect_success 'combine:... 
with URL encoding' ' 358 + git -C r3 rev-list --objects \ 359 + --filter=combine:tree%3a2+blob:%6Eon%65 HEAD >actual && 360 + 361 + expect_has HEAD "" && 362 + expect_has HEAD~1 "" && 363 + expect_has HEAD dir1 && 364 + 365 + # There are also 2 commit objects 366 + test_line_count = 5 actual 367 + ' 368 + 369 + expect_invalid_filter_spec () { 370 + spec="$1" && 371 + err="$2" && 372 + 373 + test_must_fail git -C r3 rev-list --objects --filter="$spec" HEAD \ 374 + >actual 2>actual_stderr && 375 + test_must_be_empty actual && 376 + test_i18ngrep "$err" actual_stderr 377 + } 378 + 379 + test_expect_success 'combine:... while URL-encoding things that should not be' ' 380 + expect_invalid_filter_spec combine%3Atree:2+blob:none \ 381 + "invalid filter-spec" 382 + ' 383 + 384 + test_expect_success 'combine: with nothing after the :' ' 385 + expect_invalid_filter_spec combine: "expected something after combine:" 386 + ' 387 + 388 + test_expect_success 'parse error in first sub-filter in combine:' ' 389 + expect_invalid_filter_spec combine:tree:asdf+blob:none \ 390 + "expected .tree:<depth>." 391 + ' 392 + 393 + test_expect_success 'combine:... with non-encoded reserved chars' ' 394 + expect_invalid_filter_spec combine:tree:2+sparse:@xyz \ 395 + "must escape char in sub-filter-spec: .@." && 396 + expect_invalid_filter_spec combine:tree:2+sparse:\` \ 397 + "must escape char in sub-filter-spec: .\`." && 398 + expect_invalid_filter_spec combine:tree:2+sparse:~abc \ 399 + "must escape char in sub-filter-spec: .\~." 400 + ' 401 + 402 + test_expect_success 'validate err msg for "combine:<valid-filter>+"' ' 403 + expect_invalid_filter_spec combine:tree:2+ "expected .tree:<depth>." 404 + ' 405 + 406 + test_expect_success 'combine:... 
with edge-case hex digits: Ff Aa 0 9' ' 407 + git -C r3 rev-list --objects --filter="combine:tree:2+bl%6Fb:n%6fne" \ 408 + HEAD >actual && 409 + test_line_count = 5 actual && 410 + git -C r3 rev-list --objects --filter="combine:tree%3A2+blob%3anone" \ 411 + HEAD >actual && 412 + test_line_count = 5 actual && 413 + git -C r3 rev-list --objects --filter="combine:tree:%30" HEAD >actual && 414 + test_line_count = 2 actual && 415 + git -C r3 rev-list --objects --filter="combine:tree:%39+blob:none" \ 416 + HEAD >actual && 417 + test_line_count = 5 actual 418 + ' 419 + 420 + test_expect_success 'add a sparse pattern blob whose path has reserved chars' ' 421 + cp r3/pattern r3/pattern1+renamed% && 422 + git -C r3 add pattern1+renamed% && 423 + git -C r3 commit -m "add sparse pattern file with reserved chars" 424 + ' 425 + 426 + test_expect_success 'combine:... with more than two sub-filters' ' 427 + git -C r3 rev-list --objects \ 428 + --filter=combine:tree:3+blob:limit=40+sparse:oid=master:pattern \ 429 + HEAD >actual && 430 + 431 + expect_has HEAD "" && 432 + expect_has HEAD~1 "" && 433 + expect_has HEAD~2 "" && 434 + expect_has HEAD dir1 && 435 + expect_has HEAD dir1/sparse1 && 436 + expect_has HEAD dir1/sparse2 && 437 + 438 + # Should also have 3 commits 439 + test_line_count = 9 actual && 440 + 441 + # Try again, this time making sure the last sub-filter is only 442 + # URL-decoded once. 443 + cp actual expect && 444 + 445 + git -C r3 rev-list --objects \ 446 + --filter=combine:tree:3+blob:limit=40+sparse:oid=master:pattern1%2brenamed%25 \ 447 + HEAD >actual && 448 + test_cmp expect actual 449 + ' 450 + 333 451 # Test provisional omit collection logic with a repo that has objects appearing 334 452 # at multiple depths - first deeper than the filter's threshold, then shallow. 335 453 ··· 371 489 372 490 echo "Skipping contents of tree subdir/..." 
>expect && 373 491 test_cmp expect actual 492 + ' 493 + 494 + test_expect_success 'setup r5' ' 495 + git init r5 && 496 + mkdir -p r5/subdir && 497 + 498 + echo 1 >r5/short-root && 499 + echo 12345 >r5/long-root && 500 + echo a >r5/subdir/short-subdir && 501 + echo abcde >r5/subdir/long-subdir && 502 + 503 + git -C r5 add short-root long-root subdir && 504 + git -C r5 commit -m "commit msg" 505 + ' 506 + 507 + test_expect_success 'verify collecting omits in combined: filter' ' 508 + # Note that this test guards against the naive implementation of simply 509 + # giving both filters the same "omits" set and expecting it to 510 + # automatically merge them. 511 + git -C r5 rev-list --objects --quiet --filter-print-omitted \ 512 + --filter=combine:tree:2+blob:limit=3 HEAD >actual && 513 + 514 + # Expect 0 trees/commits, 3 blobs omitted (all blobs except short-root) 515 + omitted_1=$(echo 12345 | git hash-object --stdin) && 516 + omitted_2=$(echo a | git hash-object --stdin) && 517 + omitted_3=$(echo abcde | git hash-object --stdin) && 518 + 519 + grep ~$omitted_1 actual && 520 + grep ~$omitted_2 actual && 521 + grep ~$omitted_3 actual && 522 + test_line_count = 3 actual 374 523 ' 375 524 376 525 # Test tree:<depth> where a tree is iterated to twice - once where a subentry is
+6
url.c
··· 86 86 return url_decode_internal(&url, len, NULL, &out, 0); 87 87 } 88 88 89 + char *url_percent_decode(const char *encoded) 90 + { 91 + struct strbuf out = STRBUF_INIT; 92 + return url_decode_internal(&encoded, strlen(encoded), NULL, &out, 0); 93 + } 94 + 89 95 char *url_decode_parameter_name(const char **query) 90 96 { 91 97 struct strbuf out = STRBUF_INIT;
+8
url.h
··· 7 7 int is_urlschemechar(int first_flag, int ch); 8 8 char *url_decode(const char *url); 9 9 char *url_decode_mem(const char *url, int len); 10 + 11 + /* 12 + * Similar to the url_decode_{,mem} methods above, but doesn't assume there 13 + * is a scheme followed by a : at the start of the string. Instead, %-sequences 14 + * before any : are also parsed. 15 + */ 16 + char *url_percent_decode(const char *encoded); 17 + 10 18 char *url_decode_parameter_name(const char **query); 11 19 char *url_decode_parameter_value(const char **query); 12 20