Git fork

Merge branch 'kb/name-hash'

The code to keep track of what directory names are known to Git on
platforms with case insensitive filesystems can get confused upon
a hash collision between these pathnames and looped forever.

* kb/name-hash:
name-hash.c: fix endless loop with core.ignorecase=true

+166 -62
+3 -14
cache.h
··· 132 132 unsigned int ce_namelen; 133 133 unsigned char sha1[20]; 134 134 struct cache_entry *next; 135 - struct cache_entry *dir_next; 136 135 char name[FLEX_ARRAY]; /* more */ 137 136 }; 138 137 ··· 268 267 unsigned name_hash_initialized : 1, 269 268 initialized : 1; 270 269 struct hash_table name_hash; 270 + struct hash_table dir_hash; 271 271 }; 272 272 273 273 extern struct index_state the_index; 274 274 275 275 /* Name hashing */ 276 276 extern void add_name_hash(struct index_state *istate, struct cache_entry *ce); 277 - /* 278 - * We don't actually *remove* it, we can just mark it invalid so that 279 - * we won't find it in lookups. 280 - * 281 - * Not only would we have to search the lists (simple enough), but 282 - * we'd also have to rehash other hash buckets in case this makes the 283 - * hash bucket empty (common). So it's much better to just mark 284 - * it. 285 - */ 286 - static inline void remove_name_hash(struct cache_entry *ce) 287 - { 288 - ce->ce_flags |= CE_UNHASHED; 289 - } 277 + extern void remove_name_hash(struct index_state *istate, struct cache_entry *ce); 278 + extern void free_name_hash(struct index_state *istate); 290 279 291 280 292 281 #ifndef NO_THE_INDEX_COMPATIBILITY_MACROS
+139 -43
name-hash.c
··· 32 32 return hash; 33 33 } 34 34 35 - static void hash_index_entry_directories(struct index_state *istate, struct cache_entry *ce) 35 + struct dir_entry { 36 + struct dir_entry *next; 37 + struct dir_entry *parent; 38 + struct cache_entry *ce; 39 + int nr; 40 + unsigned int namelen; 41 + }; 42 + 43 + static struct dir_entry *find_dir_entry(struct index_state *istate, 44 + const char *name, unsigned int namelen) 45 + { 46 + unsigned int hash = hash_name(name, namelen); 47 + struct dir_entry *dir; 48 + 49 + for (dir = lookup_hash(hash, &istate->dir_hash); dir; dir = dir->next) 50 + if (dir->namelen == namelen && 51 + !strncasecmp(dir->ce->name, name, namelen)) 52 + return dir; 53 + return NULL; 54 + } 55 + 56 + static struct dir_entry *hash_dir_entry(struct index_state *istate, 57 + struct cache_entry *ce, int namelen) 36 58 { 37 59 /* 38 60 * Throw each directory component in the hash for quick lookup 39 61 * during a git status. Directory components are stored with their 40 62 * closing slash. Despite submodules being a directory, they never 41 63 * reach this point, because they are stored without a closing slash 42 - * in the cache. 64 + * in index_state.name_hash (as ordinary cache_entries). 43 65 * 44 - * Note that the cache_entry stored with the directory does not 45 - * represent the directory itself. It is a pointer to an existing 46 - * filename, and its only purpose is to represent existence of the 47 - * directory in the cache. It is very possible multiple directory 48 - * hash entries may point to the same cache_entry. 66 + * Note that the cache_entry stored with the dir_entry merely 67 + * supplies the name of the directory (up to dir_entry.namelen). We 68 + * track the number of 'active' files in a directory in dir_entry.nr, 69 + * so we can tell if the directory is still relevant, e.g. for git 70 + * status. However, if cache_entries are removed, we cannot pinpoint 71 + * an exact cache_entry that's still active. It is very possible that 72 + * multiple dir_entries point to the same cache_entry. 49 73 */ 50 - unsigned int hash; 51 - void **pos; 74 + struct dir_entry *dir; 75 + 76 + /* get length of parent directory */ 77 + while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1])) 78 + namelen--; 79 + if (namelen <= 0) 80 + return NULL; 81 + 82 + /* lookup existing entry for that directory */ 83 + dir = find_dir_entry(istate, ce->name, namelen); 84 + if (!dir) { 85 + /* not found, create it and add to hash table */ 86 + void **pdir; 87 + unsigned int hash = hash_name(ce->name, namelen); 88 + 89 + dir = xcalloc(1, sizeof(struct dir_entry)); 90 + dir->namelen = namelen; 91 + dir->ce = ce; 52 92 53 - const char *ptr = ce->name; 54 - while (*ptr) { 55 - while (*ptr && *ptr != '/') 56 - ++ptr; 57 - if (*ptr == '/') { 58 - ++ptr; 59 - hash = hash_name(ce->name, ptr - ce->name); 60 - pos = insert_hash(hash, ce, &istate->name_hash); 61 - if (pos) { 62 - ce->dir_next = *pos; 63 - *pos = ce; 64 - } 93 + pdir = insert_hash(hash, dir, &istate->dir_hash); 94 + if (pdir) { 95 + dir->next = *pdir; 96 + *pdir = dir; 65 97 } 98 + 99 + /* recursively add missing parent directories */ 100 + dir->parent = hash_dir_entry(istate, ce, namelen - 1); 66 101 } 102 + return dir; 103 + } 104 + 105 + static void add_dir_entry(struct index_state *istate, struct cache_entry *ce) 106 + { 107 + /* Add reference to the directory entry (and parents if 0). */ 108 + struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce)); 109 + while (dir && !(dir->nr++)) 110 + dir = dir->parent; 111 + } 112 + 113 + static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce) 114 + { 115 + /* 116 + * Release reference to the directory entry (and parents if 0). 117 + * 118 + * Note: we do not remove / free the entry because there's no 119 + * hash.[ch]::remove_hash and dir->next may point to other entries 120 + * that are still valid, so we must not free the memory. 121 + */ 122 + struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce)); 123 + while (dir && dir->nr && !(--dir->nr)) 124 + dir = dir->parent; 67 125 } 68 126 69 127 static void hash_index_entry(struct index_state *istate, struct cache_entry *ce) ··· 74 132 if (ce->ce_flags & CE_HASHED) 75 133 return; 76 134 ce->ce_flags |= CE_HASHED; 77 - ce->next = ce->dir_next = NULL; 135 + ce->next = NULL; 78 136 hash = hash_name(ce->name, ce_namelen(ce)); 79 137 pos = insert_hash(hash, ce, &istate->name_hash); 80 138 if (pos) { ··· 82 140 *pos = ce; 83 141 } 84 142 85 - if (ignore_case) 86 - hash_index_entry_directories(istate, ce); 143 + if (ignore_case && !(ce->ce_flags & CE_UNHASHED)) 144 + add_dir_entry(istate, ce); 87 145 } 88 146 89 147 static void lazy_init_name_hash(struct index_state *istate) ··· 101 159 102 160 void add_name_hash(struct index_state *istate, struct cache_entry *ce) 103 161 { 162 + /* if already hashed, add reference to directory entries */ 163 + if (ignore_case && (ce->ce_flags & CE_STATE_MASK) == CE_STATE_MASK) 164 + add_dir_entry(istate, ce); 165 + 104 166 ce->ce_flags &= ~CE_UNHASHED; 105 167 if (istate->name_hash_initialized) 106 168 hash_index_entry(istate, ce); 107 169 } 108 170 171 + /* 172 + * We don't actually *remove* it, we can just mark it invalid so that 173 + * we won't find it in lookups. 174 + * 175 + * Not only would we have to search the lists (simple enough), but 176 + * we'd also have to rehash other hash buckets in case this makes the 177 + * hash bucket empty (common). So it's much better to just mark 178 + * it. 179 + */ 180 + void remove_name_hash(struct index_state *istate, struct cache_entry *ce) 181 + { 182 + /* if already hashed, release reference to directory entries */ 183 + if (ignore_case && (ce->ce_flags & CE_STATE_MASK) == CE_HASHED) 184 + remove_dir_entry(istate, ce); 185 + 186 + ce->ce_flags |= CE_UNHASHED; 187 + } 188 + 109 189 static int slow_same_name(const char *name1, int len1, const char *name2, int len2) 110 190 { 111 191 if (len1 != len2) ··· 139 219 if (!icase) 140 220 return 0; 141 221 142 - /* 143 - * If the entry we're comparing is a filename (no trailing slash), then compare 144 - * the lengths exactly. 145 - */ 146 - if (name[namelen - 1] != '/') 147 - return slow_same_name(name, namelen, ce->name, len); 148 - 149 - /* 150 - * For a directory, we point to an arbitrary cache_entry filename. Just 151 - * make sure the directory portion matches. 152 - */ 153 - return slow_same_name(name, namelen, ce->name, namelen < len ? namelen : len); 222 + return slow_same_name(name, namelen, ce->name, len); 154 223 } 155 224 156 225 struct cache_entry *index_name_exists(struct index_state *istate, const char *name, int namelen, int icase) ··· 166 235 if (same_name(ce, name, namelen, icase)) 167 236 return ce; 168 237 } 169 - if (icase && name[namelen - 1] == '/') 170 - ce = ce->dir_next; 171 - else 172 - ce = ce->next; 238 + ce = ce->next; 173 239 } 174 240 175 241 /* 176 - * Might be a submodule. Despite submodules being directories, 242 + * When looking for a directory (trailing '/'), it might be a 243 + * submodule or a directory. Despite submodules being directories, 177 244 * they are stored in the name hash without a closing slash. 178 - * When ignore_case is 1, directories are stored in the name hash 179 - * with their closing slash. 245 + * When ignore_case is 1, directories are stored in a separate hash 246 + * table *with* their closing slash. 180 247 * 181 248 * The side effect of this storage technique is we have need to 249 + * lookup the directory in a separate hash table, and if not found 182 250 * remove the slash from name and perform the lookup again without 183 251 * the slash. If a match is made, S_ISGITLINK(ce->mode) will be 184 252 * true. 185 253 */ 186 254 if (icase && name[namelen - 1] == '/') { 255 + struct dir_entry *dir = find_dir_entry(istate, name, namelen); 256 + if (dir && dir->nr) 257 + return dir->ce; 258 + 187 259 ce = index_name_exists(istate, name, namelen - 1, icase); 188 260 if (ce && S_ISGITLINK(ce->ce_mode)) 189 261 return ce; 190 262 } 191 263 return NULL; 192 264 } 265 + 266 + static int free_dir_entry(void *entry, void *unused) 267 + { 268 + struct dir_entry *dir = entry; 269 + while (dir) { 270 + struct dir_entry *next = dir->next; 271 + free(dir); 272 + dir = next; 273 + } 274 + return 0; 275 + } 276 + 277 + void free_name_hash(struct index_state *istate) 278 + { 279 + if (!istate->name_hash_initialized) 280 + return; 281 + istate->name_hash_initialized = 0; 282 + if (ignore_case) 283 + /* free directory entries */ 284 + for_each_hash(&istate->dir_hash, free_dir_entry, NULL); 285 + 286 + free_hash(&istate->name_hash); 287 + free_hash(&istate->dir_hash); 288 + }
+4 -5
read-cache.c
··· 46 46 { 47 47 struct cache_entry *old = istate->cache[nr]; 48 48 49 - remove_name_hash(old); 49 + remove_name_hash(istate, old); 50 50 set_index_entry(istate, nr, ce); 51 51 istate->cache_changed = 1; 52 52 } ··· 460 460 struct cache_entry *ce = istate->cache[pos]; 461 461 462 462 record_resolve_undo(istate, ce); 463 - remove_name_hash(ce); 463 + remove_name_hash(istate, ce); 464 464 istate->cache_changed = 1; 465 465 istate->cache_nr--; 466 466 if (pos >= istate->cache_nr) ··· 483 483 484 484 for (i = j = 0; i < istate->cache_nr; i++) { 485 485 if (ce_array[i]->ce_flags & CE_REMOVE) 486 - remove_name_hash(ce_array[i]); 486 + remove_name_hash(istate, ce_array[i]); 487 487 else 488 488 ce_array[j++] = ce_array[i]; 489 489 } ··· 1515 1515 istate->cache_changed = 0; 1516 1516 istate->timestamp.sec = 0; 1517 1517 istate->timestamp.nsec = 0; 1518 - istate->name_hash_initialized = 0; 1519 - free_hash(&istate->name_hash); 1518 + free_name_hash(istate); 1520 1519 cache_tree_free(&(istate->cache_tree)); 1521 1520 istate->initialized = 0; 1522 1521
+20
t/t7062-wtstatus-ignorecase.sh
··· 1 + #!/bin/sh 2 + 3 + test_description='git-status with core.ignorecase=true' 4 + 5 + . ./test-lib.sh 6 + 7 + test_expect_success 'status with hash collisions' ' 8 + # note: "V/", "V/XQANY/" and "WURZAUP/" produce the same hash code 9 + # in name-hash.c::hash_name 10 + mkdir V && 11 + mkdir V/XQANY && 12 + mkdir WURZAUP && 13 + touch V/XQANY/test && 14 + git config core.ignorecase true && 15 + git add . && 16 + # test is successful if git status completes (no endless loop) 17 + git status 18 + ' 19 + 20 + test_done