A monorepo management tool for the agentic ages
at main 4989 lines 212 kB view raw
1#!/usr/bin/env python3 2 3""" 4git-filter-repo filters git repositories, similar to git filter-branch, BFG 5repo cleaner, and others. The basic idea is that it works by running 6 git fast-export <options> | filter | git fast-import <options> 7where this program not only launches the whole pipeline but also serves as 8the 'filter' in the middle. It does a few additional things on top as well 9in order to make it into a well-rounded filtering tool. 10 11git-filter-repo can also be used as a library for more involved filtering 12operations; however: 13 ***** API BACKWARD COMPATIBILITY CAVEAT ***** 14 Programs using git-filter-repo as a library can reach pretty far into its 15 internals, but I am not prepared to guarantee backward compatibility of 16 all APIs. I suspect changes will be rare, but I reserve the right to 17 change any API. Since it is assumed that repository filtering is 18 something one would do very rarely, and in particular that it's a 19 one-shot operation, this should not be a problem in practice for anyone. 20 However, if you want to re-use a program you have written that uses 21 git-filter-repo as a library (or makes use of one of its --*-callback 22 arguments), you should either make sure you are using the same version of 23 git and git-filter-repo, or make sure to re-test it. 24 25 If there are particular pieces of the API you are concerned about, and 26 there is not already a testcase for it in t9391-lib-usage.sh or 27 t9392-python-callback.sh, please contribute a testcase. That will not 28 prevent me from changing the API, but it will allow you to look at the 29 history of a testcase to see whether and how the API changed. 30 ***** END API BACKWARD COMPATIBILITY CAVEAT ***** 31""" 32 33import argparse 34import collections 35import fnmatch 36import gettext 37import io 38import os 39import platform 40import re 41import shutil 42import subprocess 43import sys 44import time 45import textwrap 46 47from datetime import tzinfo, timedelta, datetime 48 49__all__ = ["Blob", "Reset", "FileChange", "Commit", "Tag", "Progress", 50 "Checkpoint", "FastExportParser", "ProgressWriter", 51 "string_to_date", "date_to_string", 52 "record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"] 53 54# The globals to make visible to callbacks. They will see all our imports for 55# free, as well as our public API. 56public_globals = ["__builtins__", "argparse", "collections", "fnmatch", 57 "gettext", "io", "os", "platform", "re", "shutil", 58 "subprocess", "sys", "time", "textwrap", "tzinfo", 59 "timedelta", "datetime"] + __all__ 60 61deleted_hash = b'0'*40 62write_marks = True 63date_format_permissive = True 64 65def gettext_poison(msg): 66 if "GIT_TEST_GETTEXT_POISON" in os.environ: # pragma: no cover 67 return "# GETTEXT POISON #" 68 return gettext.gettext(msg) 69 70_ = gettext_poison 71 72def setup_gettext(): 73 TEXTDOMAIN="git-filter-repo" 74 podir = os.environ.get("GIT_TEXTDOMAINDIR") or "@@LOCALEDIR@@" 75 if not os.path.isdir(podir): # pragma: no cover 76 podir = None # Python has its own fallback; use that 77 78 ## This looks like the most straightforward translation of the relevant 79 ## code in git.git:gettext.c and git.git:perl/Git/I18n.pm: 80 #import locale 81 #locale.setlocale(locale.LC_MESSAGES, ""); 82 #locale.setlocale(locale.LC_TIME, ""); 83 #locale.textdomain(TEXTDOMAIN); 84 #locale.bindtextdomain(TEXTDOMAIN, podir); 85 ## but the python docs suggest using the gettext module (which doesn't 86 ## have setlocale()) instead, so: 87 gettext.textdomain(TEXTDOMAIN); 88 gettext.bindtextdomain(TEXTDOMAIN, podir); 89 90def _timedelta_to_seconds(delta): 91 """ 92 Converts timedelta to seconds 93 """ 94 offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000 95 return round(offset) 96 97class FixedTimeZone(tzinfo): 98 """ 99 Fixed offset in minutes east from UTC. 100 """ 101 102 tz_re = re.compile(br'^([-+]?)(\d\d)(\d\d)$') 103 104 def __init__(self, offset_string): 105 tzinfo.__init__(self) 106 sign, hh, mm = FixedTimeZone.tz_re.match(offset_string).groups() 107 factor = -1 if (sign and sign == b'-') else 1 108 self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm))) 109 self._offset_string = offset_string 110 111 def utcoffset(self, dt): 112 return self._offset 113 114 def tzname(self, dt): 115 return self._offset_string 116 117 def dst(self, dt): 118 return timedelta(0) 119 120def string_to_date(datestring): 121 (unix_timestamp, tz_offset) = datestring.split() 122 return datetime.fromtimestamp(int(unix_timestamp), 123 FixedTimeZone(tz_offset)) 124 125def date_to_string(dateobj): 126 epoch = datetime.fromtimestamp(0, dateobj.tzinfo) 127 return(b'%d %s' % (int(_timedelta_to_seconds(dateobj - epoch)), 128 dateobj.tzinfo.tzname(0))) 129 130def decode(bytestr): 131 'Try to convert bytestr to utf-8 for outputting as an error message.' 132 return bytestr.decode('utf-8', 'backslashreplace') 133 134def glob_to_regex(glob_bytestr): 135 'Translate glob_bytestr into a regex on bytestrings' 136 137 # fnmatch.translate is idiotic and won't accept bytestrings 138 if (decode(glob_bytestr).encode() != glob_bytestr): # pragma: no cover 139 raise SystemExit(_("Error: Cannot handle glob %s").format(glob_bytestr)) 140 141 # Create regex operating on string 142 regex = fnmatch.translate(decode(glob_bytestr)) 143 144 # FIXME: This is an ugly hack... 145 # fnmatch.translate tries to do multi-line matching and wants the glob to 146 # match up to the end of the input, which isn't relevant for us, so we 147 # have to modify the regex. fnmatch.translate has used different regex 148 # constructs to achieve this with different python versions, so we have 149 # to check for each of them and then fix it up. It would be much better 150 # if fnmatch.translate could just take some flags to allow us to specify 151 # what we want rather than employing this hackery, but since it 152 # doesn't... 153 if regex.endswith(r'\Z(?ms)'): # pragma: no cover 154 regex = regex[0:-7] 155 elif regex.startswith(r'(?s:') and regex.endswith(r')\Z'): # pragma: no cover 156 regex = regex[4:-3] 157 elif regex.startswith(r'(?s:') and regex.endswith(r')\z'): # pragma: no cover 158 # Yaay, python3.14 for senselessly duplicating \Z as \z... 159 regex = regex[4:-3] 160 161 # Finally, convert back to regex operating on bytestr 162 return regex.encode() 163 164class PathQuoting: 165 _unescape = {b'a': b'\a', 166 b'b': b'\b', 167 b'f': b'\f', 168 b'n': b'\n', 169 b'r': b'\r', 170 b't': b'\t', 171 b'v': b'\v', 172 b'"': b'"', 173 b'\\':b'\\'} 174 _unescape_re = re.compile(br'\\([a-z"\\]|[0-9]{3})') 175 _escape = [bytes([x]) for x in range(127)]+[ 176 b'\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)] 177 _reverse = dict(map(reversed, _unescape.items())) 178 for x in _reverse: 179 _escape[ord(x)] = b'\\'+_reverse[x] 180 _special_chars = [len(x) > 1 for x in _escape] 181 182 @staticmethod 183 def unescape_sequence(orig): 184 seq = orig.group(1) 185 return PathQuoting._unescape[seq] if len(seq) == 1 else bytes([int(seq, 8)]) 186 187 @staticmethod 188 def dequote(quoted_string): 189 if quoted_string.startswith(b'"'): 190 assert quoted_string.endswith(b'"') 191 return PathQuoting._unescape_re.sub(PathQuoting.unescape_sequence, 192 quoted_string[1:-1]) 193 return quoted_string 194 195 @staticmethod 196 def enquote(unquoted_string): 197 # Option 1: Quoting when fast-export would: 198 # pqsc = PathQuoting._special_chars 199 # if any(pqsc[x] for x in set(unquoted_string)): 200 # Option 2, perf hack: do minimal amount of quoting required by fast-import 201 if unquoted_string.startswith(b'"') or b'\n' in unquoted_string: 202 pqe = PathQuoting._escape 203 return b'"' + b''.join(pqe[x] for x in unquoted_string) + b'"' 204 return unquoted_string 205 206class AncestryGraph(object): 207 """ 208 A class that maintains a direct acycle graph of commits for the purpose of 209 determining if one commit is the ancestor of another. 210 211 A note about identifiers in Commit objects: 212 * Commit objects have 2 identifiers: commit.old_id and commit.id, because: 213 * The original fast-export stream identified commits by an identifier. 214 This is often an integer, but is sometimes a hash (particularly when 215 --reference-excluded-parents is provided) 216 * The new fast-import stream we use may not use the same identifiers. 217 If new blobs or commits are inserted (such as lint-history does), then 218 the integer (or hash) are no longer valid. 219 220 A note about identifiers in AncestryGraph objects, of which there are three: 221 * A given AncestryGraph is based on either commit.old_id or commit.id, but 222 not both. These are the keys for self.value. 223 * Using full hashes (occasionally) for children in self.graph felt 224 wasteful, so we use our own internal integer within self.graph. 225 self.value maps from commit {old_}id to our internal integer id. 226 * When working with commit.old_id, it is also sometimes useful to be able 227 to map these to the original hash, i.e. commit.original_id. So, we 228 also have self.git_hash for mapping from commit.old_id to git's commit 229 hash. 230 """ 231 232 def __init__(self): 233 # The next internal identifier we will use; increments with every commit 234 # added to the AncestryGraph 235 self.cur_value = 0 236 237 # A mapping from the external identifers given to us to the simple integers 238 # we use in self.graph 239 self.value = {} 240 241 # A tuple of (depth, list-of-ancestors). Values and keys in this graph are 242 # all integers from the (values of the) self.value dict. The depth of a 243 # commit is one more than the max depth of any of its ancestors. 244 self.graph = {} 245 246 # A mapping from external identifier (i.e. from the keys of self.value) to 247 # the hash of the given commit. Only populated for graphs based on 248 # commit.old_id, since we won't know until later what the git_hash for 249 # graphs based on commit.id (since we have to wait for fast-import to 250 # create the commit and notify us of its hash; see _pending_renames). 251 # elsewhere 252 self.git_hash = {} 253 254 # Reverse maps; only populated if needed. Caller responsible to check 255 # and ensure they are populated 256 self._reverse_value = {} 257 self._hash_to_id = {} 258 259 # Cached results from previous calls to is_ancestor(). 260 self._cached_is_ancestor = {} 261 262 def record_external_commits(self, external_commits): 263 """ 264 Record in graph that each commit in external_commits exists, and is 265 treated as a root commit with no parents. 266 """ 267 for c in external_commits: 268 if c not in self.value: 269 self.cur_value += 1 270 self.value[c] = self.cur_value 271 self.graph[self.cur_value] = (1, []) 272 self.git_hash[c] = c 273 274 def add_commit_and_parents(self, commit, parents, githash = None): 275 """ 276 Record in graph that commit has the given parents (all identified by 277 fast export stream identifiers, usually integers but sometimes hashes). 278 parents _MUST_ have been first recorded. commit _MUST_ not have been 279 recorded yet. Also, record the mapping between commit and githash, if 280 githash is given. 281 """ 282 assert all(p in self.value for p in parents) 283 assert commit not in self.value 284 285 # Get values for commit and parents 286 self.cur_value += 1 287 self.value[commit] = self.cur_value 288 if githash: 289 self.git_hash[commit] = githash 290 graph_parents = [self.value[x] for x in parents] 291 292 # Determine depth for commit, then insert the info into the graph 293 depth = 1 294 if parents: 295 depth += max(self.graph[p][0] for p in graph_parents) 296 self.graph[self.cur_value] = (depth, graph_parents) 297 298 def record_hash(self, commit_id, githash): 299 ''' 300 If a githash was not recorded for commit_id, when add_commit_and_parents 301 was called, add it now. 302 ''' 303 assert commit_id in self.value 304 assert commit_id not in self.git_hash 305 self.git_hash[commit_id] = githash 306 307 def _ensure_reverse_maps_populated(self): 308 if not self._hash_to_id: 309 assert not self._reverse_value 310 self._hash_to_id = {v: k for k, v in self.git_hash.items()} 311 self._reverse_value = {v: k for k, v in self.value.items()} 312 313 def get_parent_hashes(self, commit_hash): 314 ''' 315 Given a commit_hash, return its parents hashes 316 ''' 317 # 318 # We have to map: 319 # commit hash -> fast export stream id -> graph id 320 # then lookup 321 # parent graph ids for given graph id 322 # then we need to map 323 # parent graph ids -> parent fast export ids -> parent commit hashes 324 # 325 self._ensure_reverse_maps_populated() 326 commit_fast_export_id = self._hash_to_id[commit_hash] 327 commit_graph_id = self.value[commit_fast_export_id] 328 parent_graph_ids = self.graph[commit_graph_id][1] 329 parent_fast_export_ids = [self._reverse_value[x] for x in parent_graph_ids] 330 parent_hashes = [self.git_hash[x] for x in parent_fast_export_ids] 331 return parent_hashes 332 333 def map_to_hash(self, commit_id): 334 ''' 335 Given a commit (by fast export stream id), return its hash 336 ''' 337 return self.git_hash.get(commit_id, None) 338 339 def is_ancestor(self, possible_ancestor, check): 340 """ 341 Return whether possible_ancestor is an ancestor of check 342 """ 343 a, b = self.value[possible_ancestor], self.value[check] 344 original_pair = (a,b) 345 a_depth = self.graph[a][0] 346 ancestors = [b] 347 visited = set() 348 while ancestors: 349 ancestor = ancestors.pop() 350 prev_pair = (a, ancestor) 351 if prev_pair in self._cached_is_ancestor: 352 if not self._cached_is_ancestor[prev_pair]: 353 continue 354 self._cached_is_ancestor[original_pair] = True 355 return True 356 if ancestor in visited: 357 continue 358 visited.add(ancestor) 359 depth, more_ancestors = self.graph[ancestor] 360 if ancestor == a: 361 self._cached_is_ancestor[original_pair] = True 362 return True 363 elif depth <= a_depth: 364 continue 365 ancestors.extend(more_ancestors) 366 self._cached_is_ancestor[original_pair] = False 367 return False 368 369class MailmapInfo(object): 370 def __init__(self, filename): 371 self.changes = {} 372 self._parse_file(filename) 373 374 def _parse_file(self, filename): 375 name_and_email_re = re.compile(br'(.*?)\s*<([^>]*)>\s*') 376 comment_re = re.compile(br'\s*#.*') 377 if not os.access(filename, os.R_OK): 378 raise SystemExit(_("Cannot read %s") % decode(filename)) 379 with open(filename, 'br') as f: 380 count = 0 381 for line in f: 382 count += 1 383 err = "Unparseable mailmap file: line #{} is bad: {}".format(count, line) 384 # Remove comments 385 line = comment_re.sub(b'', line) 386 # Remove leading and trailing whitespace 387 line = line.strip() 388 if not line: 389 continue 390 391 m = name_and_email_re.match(line) 392 if not m: 393 raise SystemExit(err) 394 proper_name, proper_email = m.groups() 395 if len(line) == m.end(): 396 self.changes[(None, proper_email)] = (proper_name, proper_email) 397 continue 398 rest = line[m.end():] 399 m = name_and_email_re.match(rest) 400 if m: 401 commit_name, commit_email = m.groups() 402 if len(rest) != m.end(): 403 raise SystemExit(err) 404 else: 405 commit_name, commit_email = rest, None 406 self.changes[(commit_name, commit_email)] = (proper_name, proper_email) 407 408 def translate(self, name, email): 409 ''' Given a name and email, return the expected new name and email from the 410 mailmap if there is a translation rule for it, otherwise just return 411 the given name and email.''' 412 for old, new in self.changes.items(): 413 old_name, old_email = old 414 new_name, new_email = new 415 if (old_email is None or email.lower() == old_email.lower()) and ( 416 name == old_name or not old_name): 417 return (new_name or name, new_email or email) 418 return (name, email) 419 420class ProgressWriter(object): 421 def __init__(self): 422 self._last_progress_update = time.time() 423 self._last_message = None 424 425 def show(self, msg): 426 self._last_message = msg 427 now = time.time() 428 if now - self._last_progress_update > .1: 429 self._last_progress_update = now 430 sys.stdout.write("\r{}".format(msg)) 431 sys.stdout.flush() 432 433 def finish(self): 434 self._last_progress_update = 0 435 if self._last_message: 436 self.show(self._last_message) 437 sys.stdout.write("\n") 438 439class _IDs(object): 440 """ 441 A class that maintains the 'name domain' of all the 'marks' (short int 442 id for a blob/commit git object). There are two reasons this mechanism 443 is necessary: 444 (1) the output text of fast-export may refer to an object using a different 445 mark than the mark that was assigned to that object using IDS.new(). 446 (This class allows you to translate the fast-export marks, "old" to 447 the marks assigned from IDS.new(), "new"). 448 (2) when we prune a commit, its "old" id becomes invalid. Any commits 449 which had that commit as a parent needs to use the nearest unpruned 450 ancestor as its parent instead. 451 452 Note that for purpose (1) above, this typically comes about because the user 453 manually creates Blob or Commit objects (for insertion into the stream). 454 It could also come about if we attempt to read the data from two different 455 repositories and trying to combine the data (git fast-export will number ids 456 from 1...n, and having two 1's, two 2's, two 3's, causes issues; granted, we 457 this scheme doesn't handle the two streams perfectly either, but if the first 458 fast export stream is entirely processed and handled before the second stream 459 is started, this mechanism may be sufficient to handle it). 460 """ 461 462 def __init__(self): 463 """ 464 Init 465 """ 466 # The id for the next created blob/commit object 467 self._next_id = 1 468 469 # A map of old-ids to new-ids (1:1 map) 470 self._translation = {} 471 472 # A map of new-ids to every old-id that points to the new-id (1:N map) 473 self._reverse_translation = {} 474 475 def has_renames(self): 476 """ 477 Return whether there have been ids remapped to new values 478 """ 479 return bool(self._translation) 480 481 def new(self): 482 """ 483 Should be called whenever a new blob or commit object is created. The 484 returned value should be used as the id/mark for that object. 485 """ 486 rv = self._next_id 487 self._next_id += 1 488 return rv 489 490 def record_rename(self, old_id, new_id, handle_transitivity = False): 491 """ 492 Record that old_id is being renamed to new_id. 493 """ 494 if old_id != new_id or old_id in self._translation: 495 # old_id -> new_id 496 self._translation[old_id] = new_id 497 498 # Transitivity will be needed if new commits are being inserted mid-way 499 # through a branch. 500 if handle_transitivity: 501 # Anything that points to old_id should point to new_id 502 if old_id in self._reverse_translation: 503 for id_ in self._reverse_translation[old_id]: 504 self._translation[id_] = new_id 505 506 # Record that new_id is pointed to by old_id 507 if new_id not in self._reverse_translation: 508 self._reverse_translation[new_id] = [] 509 self._reverse_translation[new_id].append(old_id) 510 511 def translate(self, old_id): 512 """ 513 If old_id has been mapped to an alternate id, return the alternate id. 514 """ 515 if old_id in self._translation: 516 return self._translation[old_id] 517 else: 518 return old_id 519 520 def __str__(self): 521 """ 522 Convert IDs to string; used for debugging 523 """ 524 rv = "Current count: %d\nTranslation:\n" % self._next_id 525 for k in sorted(self._translation): 526 rv += " %d -> %s\n" % (k, self._translation[k]) 527 528 rv += "Reverse translation:\n" 529 reverse_keys = list(self._reverse_translation.keys()) 530 if None in reverse_keys: # pragma: no cover 531 reverse_keys.remove(None) 532 reverse_keys = sorted(reverse_keys) 533 reverse_keys.append(None) 534 for k in reverse_keys: 535 rv += " " + str(k) + " -> " + str(self._reverse_translation[k]) + "\n" 536 537 return rv 538 539class _GitElement(object): 540 """ 541 The base class for all git elements that we create. 542 """ 543 544 def __init__(self): 545 # A string that describes what type of Git element this is 546 self.type = None 547 548 # A flag telling us if this Git element has been dumped 549 # (i.e. printed) or skipped. Typically elements that have been 550 # dumped or skipped will not be dumped again. 551 self.dumped = 0 552 553 def dump(self, file_): 554 """ 555 This version should never be called. Derived classes need to 556 override! We should note that subclasses should implement this 557 method such that the output would match the format produced by 558 fast-export. 559 """ 560 raise SystemExit(_("Unimplemented function: %s") % type(self).__name__ 561 +".dump()") # pragma: no cover 562 563 def __bytes__(self): 564 """ 565 Convert GitElement to bytestring; used for debugging 566 """ 567 old_dumped = self.dumped 568 writeme = io.BytesIO() 569 self.dump(writeme) 570 output_lines = writeme.getvalue().splitlines() 571 writeme.close() 572 self.dumped = old_dumped 573 return b"%s:\n %s" % (type(self).__name__.encode(), 574 b"\n ".join(output_lines)) 575 576 def skip(self, new_id=None): 577 """ 578 Ensures this element will not be written to output 579 """ 580 self.dumped = 2 581 582class _GitElementWithId(_GitElement): 583 """ 584 The base class for Git elements that have IDs (commits and blobs) 585 """ 586 587 def __init__(self): 588 _GitElement.__init__(self) 589 590 # The mark (short, portable id) for this element 591 self.id = _IDS.new() 592 593 # The previous mark for this element 594 self.old_id = None 595 596 def skip(self, new_id=None): 597 """ 598 This element will no longer be automatically written to output. When a 599 commit gets skipped, it's ID will need to be translated to that of its 600 parent. 601 """ 602 self.dumped = 2 603 604 _IDS.record_rename(self.old_id or self.id, new_id) 605 606class Blob(_GitElementWithId): 607 """ 608 This class defines our representation of git blob elements (i.e. our 609 way of representing file contents). 610 """ 611 612 def __init__(self, data, original_id = None): 613 _GitElementWithId.__init__(self) 614 615 # Denote that this is a blob 616 self.type = 'blob' 617 618 # Record original id 619 self.original_id = original_id 620 621 # Stores the blob's data 622 assert(type(data) == bytes) 623 self.data = data 624 625 def dump(self, file_): 626 """ 627 Write this blob element to a file. 628 """ 629 self.dumped = 1 630 BLOB_HASH_TO_NEW_ID[self.original_id] = self.id 631 BLOB_NEW_ID_TO_HASH[self.id] = self.original_id 632 633 file_.write(b'blob\n') 634 file_.write(b'mark :%d\n' % self.id) 635 file_.write(b'data %d\n%s' % (len(self.data), self.data)) 636 file_.write(b'\n') 637 638 639class Reset(_GitElement): 640 """ 641 This class defines our representation of git reset elements. A reset 642 event is the creation (or recreation) of a named branch, optionally 643 starting from a specific revision). 644 """ 645 646 def __init__(self, ref, from_ref = None): 647 _GitElement.__init__(self) 648 649 # Denote that this is a reset 650 self.type = 'reset' 651 652 # The name of the branch being (re)created 653 self.ref = ref 654 655 # Some reference to the branch/commit we are resetting from 656 self.from_ref = from_ref 657 658 def dump(self, file_): 659 """ 660 Write this reset element to a file 661 """ 662 self.dumped = 1 663 664 file_.write(b'reset %s\n' % self.ref) 665 if self.from_ref: 666 if isinstance(self.from_ref, int): 667 file_.write(b'from :%d\n' % self.from_ref) 668 else: 669 file_.write(b'from %s\n' % self.from_ref) 670 file_.write(b'\n') 671 672class FileChange(_GitElement): 673 """ 674 This class defines our representation of file change elements. File change 675 elements are components within a Commit element. 676 """ 677 678 def __init__(self, type_, filename = None, id_ = None, mode = None): 679 _GitElement.__init__(self) 680 681 # Denote the type of file-change (b'M' for modify, b'D' for delete, etc) 682 # We could 683 # assert(type(type_) == bytes) 684 # here but I don't just due to worries about performance overhead... 685 self.type = type_ 686 687 # Record the name of the file being changed 688 self.filename = filename 689 690 # Record the mode (mode describes type of file entry (non-executable, 691 # executable, or symlink)). 692 self.mode = mode 693 694 # blob_id is the id (mark) of the affected blob 695 self.blob_id = id_ 696 697 if type_ == b'DELETEALL': 698 assert filename is None and id_ is None and mode is None 699 self.filename = b'' # Just so PathQuoting.enquote doesn't die 700 else: 701 assert filename is not None 702 703 if type_ == b'M': 704 assert id_ is not None and mode is not None 705 elif type_ == b'D': 706 assert id_ is None and mode is None 707 elif type_ == b'R': # pragma: no cover (now avoid fast-export renames) 708 assert mode is None 709 if id_ is None: 710 raise SystemExit(_("new name needed for rename of %s") % filename) 711 self.filename = (self.filename, id_) 712 self.blob_id = None 713 714 def dump(self, file_): 715 """ 716 Write this file-change element to a file 717 """ 718 skipped_blob = (self.type == b'M' and self.blob_id is None) 719 if skipped_blob: return 720 self.dumped = 1 721 722 quoted_filename = PathQuoting.enquote(self.filename) 723 if self.type == b'M' and isinstance(self.blob_id, int): 724 file_.write(b'M %s :%d %s\n' % (self.mode, self.blob_id, quoted_filename)) 725 elif self.type == b'M': 726 file_.write(b'M %s %s %s\n' % (self.mode, self.blob_id, quoted_filename)) 727 elif self.type == b'D': 728 file_.write(b'D %s\n' % quoted_filename) 729 elif self.type == b'DELETEALL': 730 file_.write(b'deleteall\n') 731 else: 732 raise SystemExit(_("Unhandled filechange type: %s") % self.type) # pragma: no cover 733 734class Commit(_GitElementWithId): 735 """ 736 This class defines our representation of commit elements. Commit elements 737 contain all the information associated with a commit. 738 """ 739 740 def __init__(self, branch, 741 author_name, author_email, author_date, 742 committer_name, committer_email, committer_date, 743 message, 744 file_changes, 745 parents, 746 original_id = None, 747 encoding = None, # encoding for message; None implies UTF-8 748 **kwargs): 749 _GitElementWithId.__init__(self) 750 self.old_id = self.id 751 752 # Denote that this is a commit element 753 self.type = 'commit' 754 755 # Record the affected branch 756 self.branch = branch 757 758 # Record original id 759 self.original_id = original_id 760 761 # Record author's name 762 self.author_name = author_name 763 764 # Record author's email 765 self.author_email = author_email 766 767 # Record date of authoring 768 self.author_date = author_date 769 770 # Record committer's name 771 self.committer_name = committer_name 772 773 # Record committer's email 774 self.committer_email = committer_email 775 776 # Record date the commit was made 777 self.committer_date = committer_date 778 779 # Record commit message and its encoding 780 self.encoding = encoding 781 self.message = message 782 783 # List of file-changes associated with this commit. Note that file-changes 784 # are also represented as git elements 785 self.file_changes = file_changes 786 787 self.parents = parents 788 789 def dump(self, file_): 790 """ 791 Write this commit element to a file. 792 """ 793 self.dumped = 1 794 795 # Make output to fast-import slightly easier for humans to read if the 796 # message has no trailing newline of its own; cosmetic, but a nice touch... 797 extra_newline = b'\n' 798 if self.message.endswith(b'\n') or not (self.parents or self.file_changes): 799 extra_newline = b'' 800 801 if not self.parents: 802 file_.write(b'reset %s\n' % self.branch) 803 file_.write((b'commit %s\n' 804 b'mark :%d\n' 805 b'author %s <%s> %s\n' 806 b'committer %s <%s> %s\n' 807 ) % ( 808 self.branch, self.id, 809 self.author_name, self.author_email, self.author_date, 810 self.committer_name, self.committer_email, self.committer_date 811 )) 812 if self.encoding: 813 file_.write(b'encoding %s\n' % self.encoding) 814 file_.write(b'data %d\n%s%s' % 815 (len(self.message), self.message, extra_newline)) 816 for i, parent in enumerate(self.parents): 817 file_.write(b'from ' if i==0 else b'merge ') 818 if isinstance(parent, int): 819 file_.write(b':%d\n' % parent) 820 else: 821 file_.write(b'%s\n' % parent) 822 for change in self.file_changes: 823 change.dump(file_) 824 if not self.parents and not self.file_changes: 825 # Workaround a bug in pre-git-2.22 versions of fast-import with 826 # the get-mark directive. 827 file_.write(b'\n') 828 file_.write(b'\n') 829 830 def first_parent(self): 831 """ 832 Return first parent commit 833 """ 834 if self.parents: 835 return self.parents[0] 836 return None 837 838 def skip(self, new_id=None): 839 _SKIPPED_COMMITS.add(self.old_id or self.id) 840 _GitElementWithId.skip(self, new_id) 841 842class Tag(_GitElementWithId): 843 """ 844 This class defines our representation of annotated tag elements. 845 """ 846 847 def __init__(self, ref, from_ref, 848 tagger_name, tagger_email, tagger_date, tag_msg, 849 original_id = None): 850 _GitElementWithId.__init__(self) 851 self.old_id = self.id 852 853 # Denote that this is a tag element 854 self.type = 'tag' 855 856 # Store the name of the tag 857 self.ref = ref 858 859 # Store the entity being tagged (this should be a commit) 860 self.from_ref = from_ref 861 862 # Record original id 863 self.original_id = original_id 864 865 # Store the name of the tagger 866 self.tagger_name = tagger_name 867 868 # Store the email of the tagger 869 self.tagger_email = tagger_email 870 871 # Store the date 872 self.tagger_date = tagger_date 873 874 # Store the tag message 875 self.message = tag_msg 876 877 def dump(self, file_): 878 """ 879 Write this tag element to a file 880 """ 881 882 self.dumped = 1 883 884 file_.write(b'tag %s\n' % self.ref) 885 if (write_marks and self.id): 886 file_.write(b'mark :%d\n' % self.id) 887 markfmt = b'from :%d\n' if isinstance(self.from_ref, int) else b'from %s\n' 888 file_.write(markfmt % self.from_ref) 889 if self.tagger_name: 890 file_.write(b'tagger %s <%s> ' % (self.tagger_name, self.tagger_email)) 891 file_.write(self.tagger_date) 892 file_.write(b'\n') 893 file_.write(b'data %d\n%s' % (len(self.message), self.message)) 894 file_.write(b'\n') 895 896class Progress(_GitElement): 897 """ 898 This class defines our representation of progress elements. The progress 899 element only contains a progress message, which is printed by fast-import 900 when it processes the progress output. 901 """ 902 903 def __init__(self, message): 904 _GitElement.__init__(self) 905 906 # Denote that this is a progress element 907 self.type = 'progress' 908 909 # Store the progress message 910 self.message = message 911 912 def dump(self, file_): 913 """ 914 Write this progress element to a file 915 """ 916 self.dumped = 1 917 918 file_.write(b'progress %s\n' % self.message) 919 file_.write(b'\n') 920 921class Checkpoint(_GitElement): 922 """ 923 This class defines our representation of checkpoint elements. These 924 elements represent events which force fast-import to close the current 925 packfile, start a new one, and to save out all current branch refs, tags 926 and marks. 927 """ 928 929 def __init__(self): 930 _GitElement.__init__(self) 931 932 # Denote that this is a checkpoint element 933 self.type = 'checkpoint' 934 935 def dump(self, file_): 936 """ 937 Write this checkpoint element to a file 938 """ 939 self.dumped = 1 940 941 file_.write(b'checkpoint\n') 942 file_.write(b'\n') 943 944class LiteralCommand(_GitElement): 945 """ 946 This class defines our representation of commands. The literal command 947 includes only a single line, and is not processed in any special way. 948 """ 949 950 def __init__(self, line): 951 _GitElement.__init__(self) 952 953 # Denote that this is a literal element 954 self.type = 'literal' 955 956 # Store the command 957 self.line = line 958 959 def dump(self, file_): 960 """ 961 Write this progress element to a file 962 """ 963 self.dumped = 1 964 965 file_.write(self.line) 966 967class Alias(_GitElement): 968 """ 969 This class defines our representation of fast-import alias elements. An 970 alias element is the setting of one mark to the same sha1sum as another, 971 usually because the newer mark corresponded to a pruned commit. 972 """ 973 974 def __init__(self, ref, to_ref): 975 _GitElement.__init__(self) 976 # Denote that this is a reset 977 self.type = 'alias' 978 979 self.ref = ref 980 self.to_ref = to_ref 981 982 def dump(self, file_): 983 """ 984 Write this reset element to a file 985 """ 986 self.dumped = 1 987 988 file_.write(b'alias\nmark :%d\nto :%d\n\n' % (self.ref, self.to_ref)) 989 990class FastExportParser(object): 991 """ 992 A class for parsing and handling the output from fast-export. This 993 class allows the user to register callbacks when various types of 994 data are encountered in the fast-export output. The basic idea is that, 995 FastExportParser takes fast-export output, creates the various objects 996 as it encounters them, the user gets to use/modify these objects via 997 callbacks, and finally FastExportParser outputs the modified objects 998 in fast-import format (presumably so they can be used to create a new 999 repo). 1000 """ 1001 1002 def __init__(self, 1003 tag_callback = None, commit_callback = None, 1004 blob_callback = None, progress_callback = None, 1005 reset_callback = None, checkpoint_callback = None, 1006 done_callback = None): 1007 # Members below simply store callback functions for the various git 1008 # elements 1009 self._tag_callback = tag_callback 1010 self._blob_callback = blob_callback 1011 self._reset_callback = reset_callback 1012 self._commit_callback = commit_callback 1013 self._progress_callback = progress_callback 1014 self._checkpoint_callback = checkpoint_callback 1015 self._done_callback = done_callback 1016 1017 # Keep track of which refs appear from the export, and which make it to 1018 # the import (pruning of empty commits, renaming of refs, and creating 1019 # new manual objects and inserting them can cause these to differ). 1020 self._exported_refs = set() 1021 self._imported_refs = set() 1022 1023 # A list of the branches we've seen, plus the last known commit they 1024 # pointed to. An entry in latest_*commit will be deleted if we get a 1025 # reset for that branch. These are used because of fast-import's weird 1026 # decision to allow having an implicit parent via naming the branch 1027 # instead of requiring branches to be specified via 'from' directives. 1028 self._latest_commit = {} 1029 self._latest_orig_commit = {} 1030 1031 # A handle to the input source for the fast-export data 1032 self._input = None 1033 1034 # A handle to the output file for the output we generate (we call dump 1035 # on many of the git elements we create). 1036 self._output = None 1037 1038 # Stores the contents of the current line of input being parsed 1039 self._currentline = '' 1040 1041 # Tracks LFS objects we have found 1042 self._lfs_object_tracker = None 1043 1044 # Compile some regexes and cache those 1045 self._mark_re = re.compile(br'mark :(\d+)\n$') 1046 self._parent_regexes = {} 1047 parent_regex_rules = (br' :(\d+)\n$', br' ([0-9a-f]{40})\n') 1048 for parent_refname in (b'from', b'merge'): 1049 ans = [re.compile(parent_refname+x) for x in parent_regex_rules] 1050 self._parent_regexes[parent_refname] = ans 1051 self._quoted_string_re = re.compile(br'"(?:[^"\\]|\\.)*"') 1052 self._refline_regexes = {} 1053 for refline_name in (b'reset', b'commit', b'tag', b'progress'): 1054 self._refline_regexes[refline_name] = re.compile(refline_name+b' (.*)\n$') 1055 self._user_regexes = {} 1056 for user in (b'author', b'committer', b'tagger'): 1057 self._user_regexes[user] = re.compile(user + b' (.*?) <(.*?)> (.*)\n$') 1058 1059 def _advance_currentline(self): 1060 """ 1061 Grab the next line of input 1062 """ 1063 self._currentline = self._input.readline() 1064 1065 def _parse_optional_mark(self): 1066 """ 1067 If the current line contains a mark, parse it and advance to the 1068 next line; return None otherwise 1069 """ 1070 mark = None 1071 matches = self._mark_re.match(self._currentline) 1072 if matches: 1073 mark = int(matches.group(1)) 1074 self._advance_currentline() 1075 return mark 1076 1077 def _parse_optional_parent_ref(self, refname): 1078 """ 1079 If the current line contains a reference to a parent commit, then 1080 parse it and advance the current line; otherwise return None. Note 1081 that the name of the reference ('from', 'merge') must match the 1082 refname arg. 1083 """ 1084 orig_baseref, baseref = None, None 1085 rule, altrule = self._parent_regexes[refname] 1086 matches = rule.match(self._currentline) 1087 if matches: 1088 orig_baseref = int(matches.group(1)) 1089 # We translate the parent commit mark to what it needs to be in 1090 # our mark namespace 1091 baseref = _IDS.translate(orig_baseref) 1092 self._advance_currentline() 1093 else: 1094 matches = altrule.match(self._currentline) 1095 if matches: 1096 orig_baseref = matches.group(1) 1097 baseref = orig_baseref 1098 self._advance_currentline() 1099 return orig_baseref, baseref 1100 1101 def _parse_optional_filechange(self): 1102 """ 1103 If the current line contains a file-change object, then parse it 1104 and advance the current line; otherwise return None. We only care 1105 about file changes of type b'M' and b'D' (these are the only types 1106 of file-changes that fast-export will provide). 1107 """ 1108 filechange = None 1109 changetype = self._currentline[0:1] 1110 if changetype == b'M': 1111 (changetype, mode, idnum, path) = self._currentline.split(None, 3) 1112 if idnum[0:1] == b':': 1113 idnum = idnum[1:] 1114 path = path.rstrip(b'\n') 1115 # Check for LFS objects from sources before we might toss this filechange 1116 if mode != b'160000' and self._lfs_object_tracker: 1117 value = int(idnum) if len(idnum) != 40 else idnum 1118 self._lfs_object_tracker.check_file_change_data(value, True) 1119 # We translate the idnum to our id system 1120 if len(idnum) != 40: 1121 idnum = _IDS.translate( int(idnum) ) 1122 if idnum is not None: 1123 if path.startswith(b'"'): 1124 path = PathQuoting.dequote(path) 1125 filechange = FileChange(b'M', path, idnum, mode) 1126 else: 1127 filechange = b'skipped' 1128 self._advance_currentline() 1129 elif changetype == b'D': 1130 (changetype, path) = self._currentline.split(None, 1) 1131 path = path.rstrip(b'\n') 1132 if path.startswith(b'"'): 1133 path = PathQuoting.dequote(path) 1134 filechange = FileChange(b'D', path) 1135 self._advance_currentline() 1136 elif changetype == b'R': # pragma: no cover (now avoid fast-export renames) 1137 rest = self._currentline[2:-1] 1138 if rest.startswith(b'"'): 1139 m = self._quoted_string_re.match(rest) 1140 if not m: 1141 raise SystemExit(_("Couldn't parse rename source")) 1142 orig = PathQuoting.dequote(m.group(0)) 1143 new = rest[m.end()+1:] 1144 else: 1145 orig, new = rest.split(b' ', 1) 1146 if new.startswith(b'"'): 1147 new = PathQuoting.dequote(new) 1148 filechange = FileChange(b'R', orig, new) 1149 self._advance_currentline() 1150 return filechange 1151 1152 def _parse_original_id(self): 1153 original_id = self._currentline[len(b'original-oid '):].rstrip() 1154 self._advance_currentline() 1155 return original_id 1156 1157 def _parse_encoding(self): 1158 encoding = self._currentline[len(b'encoding '):].rstrip() 1159 self._advance_currentline() 1160 return encoding 1161 1162 def _parse_ref_line(self, refname): 1163 """ 1164 Parses string data (often a branch name) from current-line. The name of 1165 the string data must match the refname arg. The program will crash if 1166 current-line does not match, so current-line will always be advanced if 1167 this method returns. 1168 """ 1169 matches = self._refline_regexes[refname].match(self._currentline) 1170 if not matches: 1171 raise SystemExit(_("Malformed %(refname)s line: '%(line)s'") % 1172 ({'refname': refname, 'line':self._currentline}) 1173 ) # pragma: no cover 1174 ref = matches.group(1) 1175 self._advance_currentline() 1176 return ref 1177 1178 def _parse_user(self, usertype): 1179 """ 1180 Get user name, email, datestamp from current-line. Current-line will 1181 be advanced. 1182 """ 1183 user_regex = self._user_regexes[usertype] 1184 (name, email, when) = user_regex.match(self._currentline).groups() 1185 1186 self._advance_currentline() 1187 return (name, email, when) 1188 1189 def _parse_data(self): 1190 """ 1191 Reads data from _input. Current-line will be advanced until it is beyond 1192 the data. 1193 """ 1194 fields = self._currentline.split() 1195 assert fields[0] == b'data' 1196 size = int(fields[1]) 1197 data = self._input.read(size) 1198 self._advance_currentline() 1199 if self._currentline == b'\n': 1200 self._advance_currentline() 1201 return data 1202 1203 def _parse_blob(self): 1204 """ 1205 Parse input data into a Blob object. Once the Blob has been created, it 1206 will be handed off to the appropriate callbacks. Current-line will be 1207 advanced until it is beyond this blob's data. The Blob will be dumped 1208 to _output once everything else is done (unless it has been skipped by 1209 the callback). 1210 """ 1211 # Parse the Blob 1212 self._advance_currentline() 1213 id_ = self._parse_optional_mark() 1214 1215 original_id = None 1216 if self._currentline.startswith(b'original-oid'): 1217 original_id = self._parse_original_id(); 1218 1219 data = self._parse_data() 1220 if self._currentline == b'\n': 1221 self._advance_currentline() 1222 1223 # Create the blob 1224 blob = Blob(data, original_id) 1225 1226 # If fast-export text had a mark for this blob, need to make sure this 1227 # mark translates to the blob's true id. 1228 if id_: 1229 blob.old_id = id_ 1230 _IDS.record_rename(id_, blob.id) 1231 1232 # Check for LFS objects 1233 if self._lfs_object_tracker: 1234 self._lfs_object_tracker.check_blob_data(data, blob.old_id, True) 1235 1236 # Call any user callback to allow them to use/modify the blob 1237 if self._blob_callback: 1238 self._blob_callback(blob) 1239 1240 # Now print the resulting blob 1241 if not blob.dumped: 1242 blob.dump(self._output) 1243 1244 def _parse_reset(self): 1245 """ 1246 Parse input data into a Reset object. Once the Reset has been created, 1247 it will be handed off to the appropriate callbacks. Current-line will 1248 be advanced until it is beyond the reset data. The Reset will be dumped 1249 to _output once everything else is done (unless it has been skipped by 1250 the callback). 1251 """ 1252 # Parse the Reset 1253 ref = self._parse_ref_line(b'reset') 1254 self._exported_refs.add(ref) 1255 ignoreme, from_ref = self._parse_optional_parent_ref(b'from') 1256 if self._currentline == b'\n': 1257 self._advance_currentline() 1258 1259 # fast-export likes to print extraneous resets that serve no purpose. 1260 # While we could continue processing such resets, that is a waste of 1261 # resources. Also, we want to avoid recording that this ref was 1262 # seen in such cases, since this ref could be rewritten to nothing. 1263 if not from_ref: 1264 self._latest_commit.pop(ref, None) 1265 self._latest_orig_commit.pop(ref, None) 1266 return 1267 1268 # Create the reset 1269 reset = Reset(ref, from_ref) 1270 1271 # Call any user callback to allow them to modify the reset 1272 if self._reset_callback: 1273 self._reset_callback(reset) 1274 1275 # Update metadata 1276 self._latest_commit[reset.ref] = reset.from_ref 1277 self._latest_orig_commit[reset.ref] = reset.from_ref 1278 1279 # Now print the resulting reset 1280 if not reset.dumped: 1281 self._imported_refs.add(reset.ref) 1282 reset.dump(self._output) 1283 1284 def _parse_commit(self): 1285 """ 1286 Parse input data into a Commit object. Once the Commit has been created, 1287 it will be handed off to the appropriate callbacks. Current-line will 1288 be advanced until it is beyond the commit data. The Commit will be dumped 1289 to _output once everything else is done (unless it has been skipped by 1290 the callback OR the callback has removed all file-changes from the commit). 1291 """ 1292 # Parse the Commit. This may look involved, but it's pretty simple; it only 1293 # looks bad because a commit object contains many pieces of data. 1294 branch = self._parse_ref_line(b'commit') 1295 self._exported_refs.add(branch) 1296 id_ = self._parse_optional_mark() 1297 1298 original_id = None 1299 if self._currentline.startswith(b'original-oid'): 1300 original_id = self._parse_original_id(); 1301 1302 author_name = None 1303 author_email = None 1304 if self._currentline.startswith(b'author'): 1305 (author_name, author_email, author_date) = self._parse_user(b'author') 1306 1307 (committer_name, committer_email, committer_date) = \ 1308 self._parse_user(b'committer') 1309 1310 if not author_name and not author_email: 1311 (author_name, author_email, author_date) = \ 1312 (committer_name, committer_email, committer_date) 1313 1314 encoding = None 1315 if self._currentline.startswith(b'encoding '): 1316 encoding = self._parse_encoding() 1317 1318 commit_msg = self._parse_data() 1319 1320 pinfo = [self._parse_optional_parent_ref(b'from')] 1321 # Due to empty pruning, we can have real 'from' and 'merge' lines that 1322 # due to commit rewriting map to a parent of None. We need to record 1323 # 'from' if its non-None, and we need to parse all 'merge' lines. 1324 while self._currentline.startswith(b'merge '): 1325 pinfo.append(self._parse_optional_parent_ref(b'merge')) 1326 orig_parents, parents = [list(tmp) for tmp in zip(*pinfo)] 1327 1328 # No parents is oddly represented as [None] instead of [], due to the 1329 # special 'from' handling. Convert it here to a more canonical form. 1330 if parents == [None]: 1331 parents = [] 1332 if orig_parents == [None]: 1333 orig_parents = [] 1334 1335 # fast-import format is kinda stupid in that it allows implicit parents 1336 # based on the branch name instead of requiring them to be specified by 1337 # 'from' directives. The only way to get no parent is by using a reset 1338 # directive first, which clears the latest_commit_for_this_branch tracking. 1339 if not orig_parents and self._latest_commit.get(branch): 1340 parents = [self._latest_commit[branch]] 1341 if not orig_parents and self._latest_orig_commit.get(branch): 1342 orig_parents = [self._latest_orig_commit[branch]] 1343 1344 # Get the list of file changes 1345 file_changes = [] 1346 file_change = self._parse_optional_filechange() 1347 had_file_changes = file_change is not None 1348 while file_change: 1349 if not (type(file_change) == bytes and file_change == b'skipped'): 1350 file_changes.append(file_change) 1351 file_change = self._parse_optional_filechange() 1352 if self._currentline == b'\n': 1353 self._advance_currentline() 1354 1355 # Okay, now we can finally create the Commit object 1356 commit = Commit(branch, 1357 author_name, author_email, author_date, 1358 committer_name, committer_email, committer_date, 1359 commit_msg, file_changes, parents, original_id, encoding) 1360 1361 # If fast-export text had a mark for this commit, need to make sure this 1362 # mark translates to the commit's true id. 1363 if id_: 1364 commit.old_id = id_ 1365 _IDS.record_rename(id_, commit.id) 1366 1367 # refs/notes/ put commit-message-related material in blobs, and name their 1368 # files according to the hash of other commits. That totally messes with 1369 # all normal callbacks; fast-export should really export these as different 1370 # kinds of objects. Until then, let's just pass these commits through as-is 1371 # and hope the blob callbacks don't mess things up. 1372 if commit.branch.startswith(b'refs/notes/'): 1373 self._imported_refs.add(commit.branch) 1374 commit.dump(self._output) 1375 return 1376 1377 # Call any user callback to allow them to modify the commit 1378 aux_info = {'orig_parents': orig_parents, 1379 'had_file_changes': had_file_changes} 1380 if self._commit_callback: 1381 self._commit_callback(commit, aux_info) 1382 1383 # Now print the resulting commit, or if prunable skip it 1384 self._latest_orig_commit[branch] = commit.id 1385 if not (commit.old_id or commit.id) in _SKIPPED_COMMITS: 1386 self._latest_commit[branch] = commit.id 1387 if not commit.dumped: 1388 self._imported_refs.add(commit.branch) 1389 commit.dump(self._output) 1390 1391 def _parse_tag(self): 1392 """ 1393 Parse input data into a Tag object. Once the Tag has been created, 1394 it will be handed off to the appropriate callbacks. Current-line will 1395 be advanced until it is beyond the tag data. The Tag will be dumped 1396 to _output once everything else is done (unless it has been skipped by 1397 the callback). 1398 """ 1399 # Parse the Tag 1400 tag = self._parse_ref_line(b'tag') 1401 self._exported_refs.add(b'refs/tags/'+tag) 1402 id_ = self._parse_optional_mark() 1403 ignoreme, from_ref = self._parse_optional_parent_ref(b'from') 1404 1405 original_id = None 1406 if self._currentline.startswith(b'original-oid'): 1407 original_id = self._parse_original_id(); 1408 1409 tagger_name, tagger_email, tagger_date = None, None, None 1410 if self._currentline.startswith(b'tagger'): 1411 (tagger_name, tagger_email, tagger_date) = self._parse_user(b'tagger') 1412 tag_msg = self._parse_data() 1413 if self._currentline == b'\n': 1414 self._advance_currentline() 1415 1416 # Create the tag 1417 tag = Tag(tag, from_ref, 1418 tagger_name, tagger_email, tagger_date, tag_msg, 1419 original_id) 1420 1421 # If fast-export text had a mark for this tag, need to make sure this 1422 # mark translates to the tag's true id. 1423 if id_: 1424 tag.old_id = id_ 1425 _IDS.record_rename(id_, tag.id) 1426 1427 # Call any user callback to allow them to modify the tag 1428 if self._tag_callback: 1429 self._tag_callback(tag) 1430 1431 # The tag might not point at anything that still exists (self.from_ref 1432 # will be None if the commit it pointed to and all its ancestors were 1433 # pruned due to being empty) 1434 if tag.from_ref: 1435 # Print out this tag's information 1436 if not tag.dumped: 1437 self._imported_refs.add(b'refs/tags/'+tag.ref) 1438 tag.dump(self._output) 1439 else: 1440 tag.skip() 1441 1442 def _parse_progress(self): 1443 """ 1444 Parse input data into a Progress object. Once the Progress has 1445 been created, it will be handed off to the appropriate 1446 callbacks. Current-line will be advanced until it is beyond the 1447 progress data. The Progress will be dumped to _output once 1448 everything else is done (unless it has been skipped by the callback). 1449 """ 1450 # Parse the Progress 1451 message = self._parse_ref_line(b'progress') 1452 if self._currentline == b'\n': 1453 self._advance_currentline() 1454 1455 # Create the progress message 1456 progress = Progress(message) 1457 1458 # Call any user callback to allow them to modify the progress messsage 1459 if self._progress_callback: 1460 self._progress_callback(progress) 1461 1462 # NOTE: By default, we do NOT print the progress message; git 1463 # fast-import would write it to fast_import_pipes which could mess with 1464 # our parsing of output from the 'ls' and 'get-mark' directives we send 1465 # to fast-import. If users want these messages, they need to process 1466 # and handle them in the appropriate callback above. 1467 1468 def _parse_checkpoint(self): 1469 """ 1470 Parse input data into a Checkpoint object. Once the Checkpoint has 1471 been created, it will be handed off to the appropriate 1472 callbacks. Current-line will be advanced until it is beyond the 1473 checkpoint data. The Checkpoint will be dumped to _output once 1474 everything else is done (unless it has been skipped by the callback). 1475 """ 1476 # Parse the Checkpoint 1477 self._advance_currentline() 1478 if self._currentline == b'\n': 1479 self._advance_currentline() 1480 1481 # Create the checkpoint 1482 checkpoint = Checkpoint() 1483 1484 # Call any user callback to allow them to drop the checkpoint 1485 if self._checkpoint_callback: 1486 self._checkpoint_callback(checkpoint) 1487 1488 # NOTE: By default, we do NOT print the checkpoint message; although it 1489 # we would only realistically get them with --stdin, the fact that we 1490 # are filtering makes me think the checkpointing is less likely to be 1491 # reasonable. In fact, I don't think it's necessary in general. If 1492 # users do want it, they should process it in the checkpoint_callback. 1493 1494 def _parse_literal_command(self): 1495 """ 1496 Parse literal command. Then just dump the line as is. 1497 """ 1498 # Create the literal command object 1499 command = LiteralCommand(self._currentline) 1500 self._advance_currentline() 1501 1502 # Now print the resulting literal command 1503 if not command.dumped: 1504 command.dump(self._output) 1505 1506 def insert(self, obj): 1507 assert not obj.dumped 1508 obj.dump(self._output) 1509 if type(obj) == Commit: 1510 self._imported_refs.add(obj.branch) 1511 elif type(obj) in (Reset, Tag): 1512 self._imported_refs.add(obj.ref) 1513 1514 def run(self, input, output): 1515 """ 1516 This method filters fast export output. 1517 """ 1518 # Set input. If no args provided, use stdin. 1519 self._input = input 1520 self._output = output 1521 1522 # Run over the input and do the filtering 1523 self._advance_currentline() 1524 while self._currentline: 1525 if self._currentline.startswith(b'blob'): 1526 self._parse_blob() 1527 elif self._currentline.startswith(b'reset'): 1528 self._parse_reset() 1529 elif self._currentline.startswith(b'commit'): 1530 self._parse_commit() 1531 elif self._currentline.startswith(b'tag'): 1532 self._parse_tag() 1533 elif self._currentline.startswith(b'progress'): 1534 self._parse_progress() 1535 elif self._currentline.startswith(b'checkpoint'): 1536 self._parse_checkpoint() 1537 elif self._currentline.startswith(b'feature'): 1538 self._parse_literal_command() 1539 elif self._currentline.startswith(b'option'): 1540 self._parse_literal_command() 1541 elif self._currentline.startswith(b'done'): 1542 if self._done_callback: 1543 self._done_callback() 1544 self._parse_literal_command() 1545 # Prevent confusion from others writing additional stuff that'll just 1546 # be ignored 1547 self._output.close() 1548 elif self._currentline.startswith(b'#'): 1549 self._parse_literal_command() 1550 elif self._currentline.startswith(b'get-mark') or \ 1551 self._currentline.startswith(b'cat-blob') or \ 1552 self._currentline.startswith(b'ls'): 1553 raise SystemExit(_("Unsupported command: '%s'") % self._currentline) 1554 else: 1555 raise SystemExit(_("Could not parse line: '%s'") % self._currentline) 1556 1557 def get_exported_and_imported_refs(self): 1558 return self._exported_refs, self._imported_refs 1559 1560def record_id_rename(old_id, new_id): 1561 """ 1562 Register a new translation 1563 """ 1564 handle_transitivity = True 1565 _IDS.record_rename(old_id, new_id, handle_transitivity) 1566 1567# Internal globals 1568_IDS = _IDs() 1569_SKIPPED_COMMITS = set() 1570BLOB_HASH_TO_NEW_ID = {} 1571BLOB_NEW_ID_TO_HASH = {} 1572sdr_next_steps = _(""" 1573NEXT STEPS FOR YOUR SENSITIVE DATA REMOVAL: 1574 * If you are doing your rewrite in multiple steps, ignore these next steps 1575 until you have completed all your invocations of git-filter-repo. 1576 * See the "Sensitive Data Removal" subsection of the "DISCUSSION" section 1577 of the manual for more details about any of the steps below. 1578 * Inspect this repository and verify that the sensitive data is indeed 1579 completely removed from all commits. 1580 * Force push the rewritten history to the server: 1581 %s 1582 * Contact the server admins for additional steps they need to take; the 1583 First Changed Commit(s)%s may come in handy here. 1584 * Have other colleagues with a clone either discard their clone and reclone 1585 OR follow the detailed steps in the manual to repeatedly rebase and 1586 purge the sensitive data from their copy. Again, the First Changed 1587 Commit(s)%s may come in handy. 1588 * See the "Prevent repeats and avoid future sensitive data spills" section 1589 of the manual. 1590"""[1:]) 1591 1592class SubprocessWrapper(object): 1593 @staticmethod 1594 def decodify(args): 1595 if type(args) == str: 1596 return args 1597 else: 1598 assert type(args) == list 1599 return [decode(x) if type(x)==bytes else x for x in args] 1600 1601 @staticmethod 1602 def call(*args, **kwargs): 1603 if 'cwd' in kwargs: 1604 kwargs['cwd'] = decode(kwargs['cwd']) 1605 return subprocess.call(SubprocessWrapper.decodify(*args), **kwargs) 1606 1607 @staticmethod 1608 def check_output(*args, **kwargs): 1609 if 'cwd' in kwargs: 1610 kwargs['cwd'] = decode(kwargs['cwd']) 1611 return subprocess.check_output(SubprocessWrapper.decodify(*args), **kwargs) 1612 1613 @staticmethod 1614 def check_call(*args, **kwargs): # pragma: no cover # used by filter-lamely 1615 if 'cwd' in kwargs: 1616 kwargs['cwd'] = decode(kwargs['cwd']) 1617 return subprocess.check_call(SubprocessWrapper.decodify(*args), **kwargs) 1618 1619 @staticmethod 1620 def Popen(*args, **kwargs): 1621 if 'cwd' in kwargs: 1622 kwargs['cwd'] = decode(kwargs['cwd']) 1623 return subprocess.Popen(SubprocessWrapper.decodify(*args), **kwargs) 1624 1625subproc = subprocess 1626if platform.system() == 'Windows' or 'PRETEND_UNICODE_ARGS' in os.environ: 1627 subproc = SubprocessWrapper 1628 1629class GitUtils(object): 1630 @staticmethod 1631 def get_commit_count(repo, *args): 1632 """ 1633 Return the number of commits that have been made on repo. 1634 """ 1635 if not args: 1636 args = ['--all'] 1637 if len(args) == 1 and isinstance(args[0], list): 1638 args = args[0] 1639 p = subproc.Popen(["git", "rev-list", "--count"] + args, 1640 stdout=subprocess.PIPE, stderr=subprocess.PIPE, 1641 cwd=repo) 1642 if p.wait() != 0: 1643 raise SystemExit(_("%s does not appear to be a valid git repository") 1644 % decode(repo)) 1645 return int(p.stdout.read()) 1646 1647 @staticmethod 1648 def get_total_objects(repo): 1649 """ 1650 Return the number of objects (both packed and unpacked) 1651 """ 1652 p1 = subproc.Popen(["git", "count-objects", "-v"], 1653 stdout=subprocess.PIPE, cwd=repo) 1654 lines = p1.stdout.read().splitlines() 1655 # Return unpacked objects + packed-objects 1656 return int(lines[0].split()[1]) + int(lines[2].split()[1]) 1657 1658 @staticmethod 1659 def is_repository_bare(repo_working_dir): 1660 out = subproc.check_output('git rev-parse --is-bare-repository'.split(), 1661 cwd=repo_working_dir) 1662 return (out.strip() == b'true') 1663 1664 @staticmethod 1665 def determine_git_dir(repo_working_dir): 1666 d = subproc.check_output('git rev-parse --git-dir'.split(), 1667 cwd=repo_working_dir).strip() 1668 if repo_working_dir==b'.' or d.startswith(b'/'): 1669 return d 1670 return os.path.join(repo_working_dir, d) 1671 1672 @staticmethod 1673 def get_refs(repo_working_dir): 1674 try: 1675 output = subproc.check_output('git show-ref'.split(), 1676 cwd=repo_working_dir) 1677 except subprocess.CalledProcessError as e: 1678 # If error code is 1, there just aren't any refs; i.e. new repo. 1679 # If error code is other than 1, some other error (e.g. not a git repo) 1680 if e.returncode != 1: 1681 raise SystemExit('fatal: {}'.format(e)) 1682 output = '' 1683 return dict(reversed(x.split()) for x in output.splitlines()) 1684 1685 @staticmethod 1686 def get_config_settings(repo_working_dir): 1687 output = '' 1688 try: 1689 output = subproc.check_output('git config --list --null'.split(), 1690 cwd=repo_working_dir) 1691 except subprocess.CalledProcessError as e: # pragma: no cover 1692 raise SystemExit('fatal: {}'.format(e)) 1693 1694 # FIXME: Ignores multi-valued keys, just let them overwrite for now 1695 return dict(item.split(b'\n', maxsplit=1) 1696 for item in output.strip().split(b"\0") if item) 1697 1698 @staticmethod 1699 def get_blob_sizes(quiet = False): 1700 blob_size_progress = ProgressWriter() 1701 num_blobs = 0 1702 processed_blobs_msg = _("Processed %d blob sizes") 1703 1704 # Get sizes of blobs by sha1 1705 cmd = '--batch-check=%(objectname) %(objecttype) ' + \ 1706 '%(objectsize) %(objectsize:disk)' 1707 cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd], 1708 bufsize = -1, 1709 stdout = subprocess.PIPE) 1710 unpacked_size = {} 1711 packed_size = {} 1712 for line in cf.stdout: 1713 try: 1714 sha, objtype, objsize, objdisksize = line.split() 1715 objsize, objdisksize = int(objsize), int(objdisksize) 1716 if objtype == b'blob': 1717 unpacked_size[sha] = objsize 1718 packed_size[sha] = objdisksize 1719 num_blobs += 1 1720 except ValueError: # pragma: no cover 1721 sys.stderr.write(_("Error: unexpected `git cat-file` output: \"%s\"\n") % line) 1722 if not quiet: 1723 blob_size_progress.show(processed_blobs_msg % num_blobs) 1724 cf.wait() 1725 if not quiet: 1726 blob_size_progress.finish() 1727 return unpacked_size, packed_size 1728 1729 @staticmethod 1730 def get_file_changes(repo, parent_hash, commit_hash): 1731 """ 1732 Return a FileChanges list with the differences between parent_hash 1733 and commit_hash 1734 """ 1735 file_changes = [] 1736 1737 cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash] 1738 output = subproc.check_output(cmd, cwd=repo) 1739 for line in output.splitlines(): 1740 fileinfo, path = line.split(b'\t', 1) 1741 if path.startswith(b'"'): 1742 path = PathQuoting.dequote(path) 1743 oldmode, mode, oldhash, newhash, changetype = fileinfo.split() 1744 if changetype == b'D': 1745 file_changes.append(FileChange(b'D', path)) 1746 elif changetype in (b'A', b'M', b'T'): 1747 identifier = BLOB_HASH_TO_NEW_ID.get(newhash, newhash) 1748 file_changes.append(FileChange(b'M', path, identifier, mode)) 1749 else: # pragma: no cover 1750 raise SystemExit("Unknown change type for line {}".format(line)) 1751 1752 return file_changes 1753 1754 @staticmethod 1755 def print_my_version(): 1756 with open(__file__, 'br') as f: 1757 contents = f.read() 1758 # If people replaced @@LOCALEDIR@@ string to point at their local 1759 # directory, undo it so we can get original source version. 1760 contents = re.sub(br'\A#\!.*', 1761 br'#!/usr/bin/env python3', contents) 1762 contents = re.sub(br'(\("GIT_TEXTDOMAINDIR"\) or ").*"', 1763 br'\1@@LOCALEDIR@@"', contents) 1764 1765 cmd = 'git hash-object --stdin'.split() 1766 version = subproc.check_output(cmd, input=contents).strip() 1767 print(decode(version[0:12])) 1768 1769class FilteringOptions(object): 1770 default_replace_text = b'***REMOVED***' 1771 class AppendFilter(argparse.Action): 1772 def __call__(self, parser, namespace, values, option_string=None): 1773 user_path = values 1774 suffix = option_string[len('--path-'):] or 'match' 1775 if suffix.startswith('rename'): 1776 mod_type = 'rename' 1777 match_type = option_string[len('--path-rename-'):] or 'match' 1778 values = values.split(b':') 1779 if len(values) != 2: 1780 raise SystemExit(_("Error: --path-rename expects one colon in its" 1781 " argument: <old_name:new_name>.")) 1782 if values[0] and values[1] and not ( 1783 values[0].endswith(b'/') == values[1].endswith(b'/')): 1784 raise SystemExit(_("Error: With --path-rename, if OLD_NAME and " 1785 "NEW_NAME are both non-empty and either ends " 1786 "with a slash then both must.")) 1787 if any(v.startswith(b'/') for v in values): 1788 raise SystemExit(_("Error: Pathnames cannot begin with a '/'")) 1789 components = values[0].split(b'/') + values[1].split(b'/') 1790 else: 1791 mod_type = 'filter' 1792 match_type = suffix 1793 components = values.split(b'/') 1794 if values.startswith(b'/'): 1795 raise SystemExit(_("Error: Pathnames cannot begin with a '/'")) 1796 for illegal_path in [b'.', b'..']: 1797 if illegal_path in components: 1798 raise SystemExit(_("Error: Invalid path component '%s' found in '%s'") 1799 % (decode(illegal_path), decode(user_path))) 1800 if match_type == 'regex': 1801 values = re.compile(values) 1802 items = getattr(namespace, self.dest, []) or [] 1803 items.append((mod_type, match_type, values)) 1804 if (match_type, mod_type) == ('glob', 'filter'): 1805 if not values.endswith(b'*'): 1806 extension = b'*' if values.endswith(b'/') else b'/*' 1807 items.append((mod_type, match_type, values+extension)) 1808 setattr(namespace, self.dest, items) 1809 1810 class HelperFilter(argparse.Action): 1811 def __call__(self, parser, namespace, values, option_string=None): 1812 af = FilteringOptions.AppendFilter(dest='path_changes', 1813 option_strings=None) 1814 dirname = values if values[-1:] == b'/' else values+b'/' 1815 if option_string == '--subdirectory-filter': 1816 af(parser, namespace, dirname, '--path-match') 1817 af(parser, namespace, dirname+b':', '--path-rename') 1818 elif option_string == '--to-subdirectory-filter': 1819 af(parser, namespace, b':'+dirname, '--path-rename') 1820 else: 1821 raise SystemExit(_("Error: HelperFilter given invalid option_string: %s") 1822 % option_string) # pragma: no cover 1823 1824 class FileWithPathsFilter(argparse.Action): 1825 def __call__(self, parser, namespace, values, option_string=None): 1826 if not namespace.path_changes: 1827 namespace.path_changes = [] 1828 namespace.path_changes += FilteringOptions.get_paths_from_file(values) 1829 1830 @staticmethod 1831 def create_arg_parser(): 1832 # Include usage in the summary, so we can put the description first 1833 summary = _('''Rewrite (or analyze) repository history 1834 1835 git-filter-repo destructively rewrites history (unless --analyze or 1836 --dry-run are given) according to specified rules. It refuses to do any 1837 rewriting unless either run from a clean fresh clone, or --force was 1838 given. 1839 1840 Basic Usage: 1841 git-filter-repo --analyze 1842 git-filter-repo [FILTER/RENAME/CONTROL OPTIONS] 1843 1844 See EXAMPLES section for details. 1845 ''').rstrip() 1846 1847 # Provide a long helpful examples section 1848 example_text = _('''CALLBACKS 1849 1850 Most callback functions are of the same general format. For a command line 1851 argument like 1852 --foo-callback 'BODY' 1853 1854 the following code will be compiled and called: 1855 def foo_callback(foo): 1856 BODY 1857 1858 The exception on callbacks is the --file-info-callback, which will be 1859 discussed further below. 1860 1861 Given the callback style, we can thus make a simple callback to replace 1862 'Jon' with 'John' in author/committer/tagger names: 1863 git filter-repo --name-callback 'return name.replace(b"Jon", b"John")' 1864 1865 To remove all 'Tested-by' tags in commit (or tag) messages: 1866 git filter-repo --message-callback 'return re.sub(br"\\nTested-by:.*", "", message)' 1867 1868 To remove all .DS_Store files: 1869 git filter-repo --filename-callback 'return None if os.path.basename(filename) == b".DS_Store" else filename' 1870 1871 Note that if BODY resolves to a filename, then the contents of that file 1872 will be used as the BODY in the callback function. 1873 1874 The --file-info-callback has a more involved function callback; for it the 1875 following code will be compiled and called: 1876 def file_info_callback(filename, mode, blob_id, value): 1877 BODY 1878 1879 It is designed to be used in cases where filtering depends on both 1880 filename and contents (and maybe mode). It is called for file changes 1881 other than deletions (since deletions have no file contents to operate 1882 on). This callback is expected to return a tuple of (filename, mode, 1883 blob_id). It can make use of the following functions from the value 1884 instance: 1885 value.get_contents_by_identifier(blob_id) -> contents (bytestring) 1886 value.get_size_by_identifier(blob_id) -> size_of_blob (int) 1887 value.insert_file_with_contents(contents) -> blob_id 1888 value.is_binary(contents) -> bool 1889 value.apply_replace_text(contents) -> new_contents (bytestring) 1890 and can read/write the following data member from the value instance: 1891 value.data (dict) 1892 1893 The filename can be used for renaming the file similar to 1894 --filename-callback (or None to drop the change), and mode is one 1895 of b'100644', b'100755', b'120000', or b'160000'. 1896 1897 For more detailed examples and explanations AND caveats, see 1898 https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#CALLBACKS 1899 1900EXAMPLES 1901 1902 To get a bunch of reports mentioning renames that have occurred in 1903 your repo and listing sizes of objects aggregated by any of path, 1904 directory, extension, or blob-id: 1905 git filter-repo --analyze 1906 1907 (These reports can help you choose how to filter your repo; it can 1908 be useful to re-run this command after filtering to regenerate the 1909 report and verify the changes look correct.) 1910 1911 To extract the history that touched just 'guides' and 'tools/releases': 1912 git filter-repo --path guides/ --path tools/releases 1913 1914 To remove foo.zip and bar/baz/zips from every revision in history: 1915 git filter-repo --path foo.zip --path bar/baz/zips/ --invert-paths 1916 1917 To replace the text 'password' with 'p455w0rd': 1918 git filter-repo --replace-text <(echo "password==>p455w0rd") 1919 1920 To use the current version of the .mailmap file to update authors, 1921 committers, and taggers throughout history and make it permanent: 1922 git filter-repo --use-mailmap 1923 1924 To extract the history of 'src/', rename all files to have a new leading 1925 directory 'my-module' (e.g. src/foo.java -> my-module/src/foo.java), and 1926 add a 'my-module-' prefix to all tags: 1927 git filter-repo --path src/ --to-subdirectory-filter my-module --tag-rename '':'my-module-' 1928 1929 For more detailed examples and explanations, see 1930 https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#EXAMPLES''') 1931 1932 # Create the basic parser 1933 parser = argparse.ArgumentParser(description=summary, 1934 usage = argparse.SUPPRESS, 1935 add_help = False, 1936 epilog = example_text, 1937 formatter_class=argparse.RawDescriptionHelpFormatter) 1938 1939 analyze = parser.add_argument_group(title=_("Analysis")) 1940 analyze.add_argument('--analyze', action='store_true', 1941 help=_("Analyze repository history and create a report that may be " 1942 "useful in determining what to filter in a subsequent run. " 1943 "Will not modify your repo.")) 1944 analyze.add_argument('--report-dir', 1945 metavar='DIR_OR_FILE', 1946 type=os.fsencode, 1947 dest='report_dir', 1948 help=_("Directory to write report, defaults to GIT_DIR/filter_repo/analysis," 1949 "refuses to run if exists, --force delete existing dir first.")) 1950 1951 path = parser.add_argument_group(title=_("Filtering based on paths " 1952 "(see also --filename-callback)"), 1953 description=textwrap.dedent(_(""" 1954 These options specify the paths to select. Note that much like git 1955 itself, renames are NOT followed so you may need to specify multiple 1956 paths, e.g. `--path olddir/ --path newdir/` 1957 """[1:]))) 1958 1959 path.add_argument('--invert-paths', action='store_false', dest='inclusive', 1960 help=_("Invert the selection of files from the specified " 1961 "--path-{match,glob,regex} options below, i.e. only select " 1962 "files matching none of those options.")) 1963 1964 path.add_argument('--path-match', '--path', metavar='DIR_OR_FILE', 1965 type=os.fsencode, 1966 action=FilteringOptions.AppendFilter, dest='path_changes', 1967 help=_("Exact paths (files or directories) to include in filtered " 1968 "history. Multiple --path options can be specified to get " 1969 "a union of paths.")) 1970 path.add_argument('--path-glob', metavar='GLOB', type=os.fsencode, 1971 action=FilteringOptions.AppendFilter, dest='path_changes', 1972 help=_("Glob of paths to include in filtered history. Multiple " 1973 "--path-glob options can be specified to get a union of " 1974 "paths.")) 1975 path.add_argument('--path-regex', metavar='REGEX', type=os.fsencode, 1976 action=FilteringOptions.AppendFilter, dest='path_changes', 1977 help=_("Regex of paths to include in filtered history. Multiple " 1978 "--path-regex options can be specified to get a union of " 1979 "paths")) 1980 path.add_argument('--use-base-name', action='store_true', 1981 help=_("Match on file base name instead of full path from the top " 1982 "of the repo. Incompatible with --path-rename, and " 1983 "incompatible with matching against directory names.")) 1984 1985 rename = parser.add_argument_group(title=_("Renaming based on paths " 1986 "(see also --filename-callback)")) 1987 rename.add_argument('--path-rename', '--path-rename-match', 1988 metavar='OLD_NAME:NEW_NAME', dest='path_changes', type=os.fsencode, 1989 action=FilteringOptions.AppendFilter, 1990 help=_("Path to rename; if filename or directory matches OLD_NAME " 1991 "rename to NEW_NAME. Multiple --path-rename options can be " 1992 "specified. NOTE: If you combine filtering options with " 1993 "renaming ones, do not rely on a rename argument to select " 1994 "paths; you also need a filter to select them.")) 1995 1996 helpers = parser.add_argument_group(title=_("Path shortcuts")) 1997 helpers.add_argument('--paths', help=argparse.SUPPRESS, metavar='IGNORE') 1998 helpers.add_argument('--paths-from-file', metavar='FILENAME', 1999 type=os.fsencode, 2000 action=FilteringOptions.FileWithPathsFilter, dest='path_changes', 2001 help=_("Specify several path filtering and renaming directives, one " 2002 "per line. Lines with '==>' in them specify path renames, " 2003 "and lines can begin with 'literal:' (the default), 'glob:', " 2004 "or 'regex:' to specify different matching styles. Blank " 2005 "lines and lines starting with a '#' are ignored.")) 2006 helpers.add_argument('--subdirectory-filter', metavar='DIRECTORY', 2007 action=FilteringOptions.HelperFilter, type=os.fsencode, 2008 help=_("Only look at history that touches the given subdirectory " 2009 "and treat that directory as the project root. Equivalent " 2010 "to using '--path DIRECTORY/ --path-rename DIRECTORY/:'")) 2011 helpers.add_argument('--to-subdirectory-filter', metavar='DIRECTORY', 2012 action=FilteringOptions.HelperFilter, type=os.fsencode, 2013 help=_("Treat the project root as if it were under DIRECTORY. " 2014 "Equivalent to using '--path-rename :DIRECTORY/'")) 2015 2016 contents = parser.add_argument_group(title=_("Content editing filters " 2017 "(see also --blob-callback)")) 2018 contents.add_argument('--replace-text', metavar='EXPRESSIONS_FILE', 2019 help=_("A file with expressions that, if found, will be replaced. " 2020 "By default, each expression is treated as literal text, " 2021 "but 'regex:' and 'glob:' prefixes are supported. You can " 2022 "end the line with '==>' and some replacement text to " 2023 "choose a replacement choice other than the default of '{}'." 2024 .format(decode(FilteringOptions.default_replace_text)))) 2025 contents.add_argument('--strip-blobs-bigger-than', metavar='SIZE', 2026 dest='max_blob_size', default=0, 2027 help=_("Strip blobs (files) bigger than specified size (e.g. '5M', " 2028 "'2G', etc)")) 2029 contents.add_argument('--strip-blobs-with-ids', metavar='BLOB-ID-FILENAME', 2030 help=_("Read git object ids from each line of the given file, and " 2031 "strip all of them from history")) 2032 2033 refrename = parser.add_argument_group(title=_("Renaming of refs " 2034 "(see also --refname-callback)")) 2035 refrename.add_argument('--tag-rename', metavar='OLD:NEW', type=os.fsencode, 2036 help=_("Rename tags starting with OLD to start with NEW. For " 2037 "example, --tag-rename foo:bar will rename tag foo-1.2.3 " 2038 "to bar-1.2.3; either OLD or NEW can be empty.")) 2039 2040 messages = parser.add_argument_group(title=_("Filtering of commit messages " 2041 "(see also --message-callback)")) 2042 messages.add_argument('--replace-message', metavar='EXPRESSIONS_FILE', 2043 help=_("A file with expressions that, if found in commit or tag " 2044 "messages, will be replaced. This file uses the same syntax " 2045 "as --replace-text.")) 2046 messages.add_argument('--preserve-commit-hashes', action='store_true', 2047 help=_("By default, since commits are rewritten and thus gain new " 2048 "hashes, references to old commit hashes in commit messages " 2049 "are replaced with new commit hashes (abbreviated to the same " 2050 "length as the old reference). Use this flag to turn off " 2051 "updating commit hashes in commit messages.")) 2052 messages.add_argument('--preserve-commit-encoding', action='store_true', 2053 help=_("Do not reencode commit messages into UTF-8. By default, if " 2054 "the commit object specifies an encoding for the commit " 2055 "message, the message is re-encoded into UTF-8.")) 2056 2057 people = parser.add_argument_group(title=_("Filtering of names & emails " 2058 "(see also --name-callback " 2059 "and --email-callback)")) 2060 people.add_argument('--mailmap', dest='mailmap', metavar='FILENAME', 2061 type=os.fsencode, 2062 help=_("Use specified mailmap file (see git-shortlog(1) for " 2063 "details on the format) when rewriting author, committer, " 2064 "and tagger names and emails. If the specified file is " 2065 "part of git history, historical versions of the file will " 2066 "be ignored; only the current contents are consulted.")) 2067 people.add_argument('--use-mailmap', dest='mailmap', 2068 action='store_const', const=b'.mailmap', 2069 help=_("Same as: '--mailmap .mailmap' ")) 2070 2071 parents = parser.add_argument_group(title=_("Parent rewriting")) 2072 parents.add_argument('--replace-refs', default=None, 2073 choices=['delete-no-add', 'delete-and-add', 2074 'update-no-add', 'update-or-add', 2075 'update-and-add', 'old-default'], 2076 help=_("How to handle replace refs (see git-replace(1)). Replace " 2077 "refs can be added during the history rewrite as a way to " 2078 "allow users to pass old commit IDs (from before " 2079 "git-filter-repo was run) to git commands and have git know " 2080 "how to translate those old commit IDs to the new " 2081 "(post-rewrite) commit IDs. Also, replace refs that existed " 2082 "before the rewrite can either be deleted or updated. The " 2083 "choices to pass to --replace-refs thus need to specify both " 2084 "what to do with existing refs and what to do with commit " 2085 "rewrites. Thus 'update-and-add' means to update existing " 2086 "replace refs, and for any commit rewrite (even if already " 2087 "pointed at by a replace ref) add a new refs/replace/ reference " 2088 "to map from the old commit ID to the new commit ID. The " 2089 "default is update-no-add, meaning update existing replace refs " 2090 "but do not add any new ones. There is also a special " 2091 "'old-default' option for picking the default used in versions " 2092 "prior to git-filter-repo-2.45, namely 'update-and-add' upon " 2093 "the first run of git-filter-repo in a repository and " 2094 "'update-or-add' if running git-filter-repo again on a " 2095 "repository.")) 2096 parents.add_argument('--prune-empty', default='auto', 2097 choices=['always', 'auto', 'never'], 2098 help=_("Whether to prune empty commits. 'auto' (the default) means " 2099 "only prune commits which become empty (not commits which were " 2100 "empty in the original repo, unless their parent was pruned). " 2101 "When the parent of a commit is pruned, the first non-pruned " 2102 "ancestor becomes the new parent.")) 2103 parents.add_argument('--prune-degenerate', default='auto', 2104 choices=['always', 'auto', 'never'], 2105 help=_("Since merge commits are needed for history topology, they " 2106 "are typically exempt from pruning. However, they can become " 2107 "degenerate with the pruning of other commits (having fewer " 2108 "than two parents, having one commit serve as both parents, or " 2109 "having one parent as the ancestor of the other.) If such " 2110 "merge commits have no file changes, they can be pruned. The " 2111 "default ('auto') is to only prune empty merge commits which " 2112 "become degenerate (not which started as such).")) 2113 parents.add_argument('--no-ff', action='store_true', 2114 help=_("Even if the first parent is or becomes an ancestor of another " 2115 "parent, do not prune it. This modifies how " 2116 "--prune-degenerate behaves, and may be useful in projects who " 2117 "always use merge --no-ff.")) 2118 2119 callback = parser.add_argument_group(title=_("Generic callback code snippets")) 2120 callback.add_argument('--filename-callback', metavar="FUNCTION_BODY_OR_FILE", 2121 help=_("Python code body for processing filenames; see CALLBACKS " 2122 "sections below.")) 2123 callback.add_argument('--file-info-callback', metavar="FUNCTION_BODY_OR_FILE", 2124 help=_("Python code body for processing file and metadata; see " 2125 "CALLBACKS sections below.")) 2126 callback.add_argument('--message-callback', metavar="FUNCTION_BODY_OR_FILE", 2127 help=_("Python code body for processing messages (both commit " 2128 "messages and tag messages); see CALLBACKS section below.")) 2129 callback.add_argument('--name-callback', metavar="FUNCTION_BODY_OR_FILE", 2130 help=_("Python code body for processing names of people; see " 2131 "CALLBACKS section below.")) 2132 callback.add_argument('--email-callback', metavar="FUNCTION_BODY_OR_FILE", 2133 help=_("Python code body for processing emails addresses; see " 2134 "CALLBACKS section below.")) 2135 callback.add_argument('--refname-callback', metavar="FUNCTION_BODY_OR_FILE", 2136 help=_("Python code body for processing refnames; see CALLBACKS " 2137 "section below.")) 2138 2139 callback.add_argument('--blob-callback', metavar="FUNCTION_BODY_OR_FILE", 2140 help=_("Python code body for processing blob objects; see " 2141 "CALLBACKS section below.")) 2142 callback.add_argument('--commit-callback', metavar="FUNCTION_BODY_OR_FILE", 2143 help=_("Python code body for processing commit objects; see " 2144 "CALLBACKS section below.")) 2145 callback.add_argument('--tag-callback', metavar="FUNCTION_BODY_OR_FILE", 2146 help=_("Python code body for processing tag objects. Note that " 2147 "lightweight tags have no tag object and are thus not " 2148 "handled by this callback. See CALLBACKS section below.")) 2149 callback.add_argument('--reset-callback', metavar="FUNCTION_BODY_OR_FILE", 2150 help=_("Python code body for processing reset objects; see " 2151 "CALLBACKS section below.")) 2152 2153 sdr = parser.add_argument_group(title=_("Sensitive Data Removal Handling")) 2154 sdr.add_argument('--sensitive-data-removal', '--sdr', action='store_true', 2155 help=_("This rewrite is intended to remove sensitive data from a " 2156 "repository. Gather extra information from the rewrite needed " 2157 "to provide additional instructions on how to clean up other " 2158 "copies.")) 2159 sdr.add_argument('--no-fetch', action='store_true', 2160 help=_("By default, --sensitive-data-removal will trigger a " 2161 "mirror-like fetch of all refs from origin, discarding local " 2162 "changes, but ensuring that _all_ fetchable refs that hold on " 2163 "to the sensitve data are rewritten. This flag removes that " 2164 "fetch, risking that other refs continue holding on to the " 2165 "sensitive data. This option is implied by --partial or any " 2166 "flag that implies --partial.")) 2167 2168 desc = _( 2169 "Specifying alternate source or target locations implies --partial,\n" 2170 "except that the normal default for --replace-refs is used. However,\n" 2171 "unlike normal uses of --partial, this doesn't risk mixing old and new\n" 2172 "history since the old and new histories are in different repositories.") 2173 location = parser.add_argument_group(title=_("Location to filter from/to"), 2174 description=desc) 2175 location.add_argument('--source', type=os.fsencode, 2176 help=_("Git repository to read from")) 2177 location.add_argument('--target', type=os.fsencode, 2178 help=_("Git repository to overwrite with filtered history")) 2179 2180 order = parser.add_argument_group(title=_("Ordering of commits")) 2181 order.add_argument('--date-order', action='store_true', 2182 help=_("Processes commits in commit timestamp order.")) 2183 2184 misc = parser.add_argument_group(title=_("Miscellaneous options")) 2185 misc.add_argument('--help', '-h', action='store_true', 2186 help=_("Show this help message and exit.")) 2187 misc.add_argument('--version', action='store_true', 2188 help=_("Display filter-repo's version and exit.")) 2189 misc.add_argument('--proceed', action='store_true', 2190 help=_("Avoid triggering the no-arguments-specified check.")) 2191 misc.add_argument('--force', '-f', action='store_true', 2192 help=_("Rewrite repository history even if the current repo does not " 2193 "look like a fresh clone. History rewriting is irreversible " 2194 "(and includes immediate pruning of reflogs and old objects), " 2195 "so be cautious about using this flag.")) 2196 misc.add_argument('--partial', action='store_true', 2197 help=_("Do a partial history rewrite, resulting in the mixture of " 2198 "old and new history. This disables rewriting " 2199 "refs/remotes/origin/* to refs/heads/*, disables removing " 2200 "of the 'origin' remote, disables removing unexported refs, " 2201 "disables expiring the reflog, and disables the automatic " 2202 "post-filter gc. Also, this modifies --tag-rename and " 2203 "--refname-callback options such that instead of replacing " 2204 "old refs with new refnames, it will instead create new " 2205 "refs and keep the old ones around. Use with caution.")) 2206 misc.add_argument('--no-gc', action='store_true', 2207 help=_("Do not run 'git gc' after filtering.")) 2208 # WARNING: --refs presents a problem with become-degenerate pruning: 2209 # * Excluding a commit also excludes its ancestors so when some other 2210 # commit has an excluded ancestor as a parent we have no way of 2211 # knowing what it is an ancestor of without doing a special 2212 # full-graph walk. 2213 misc.add_argument('--refs', nargs='+', 2214 help=_("Limit history rewriting to the specified refs. Implies " 2215 "--partial. In addition to the normal caveats of --partial " 2216 "(mixing old and new history, no automatic remapping of " 2217 "refs/remotes/origin/* to refs/heads/*, etc.), this also may " 2218 "cause problems for pruning of degenerate empty merge " 2219 "commits when negative revisions are specified.")) 2220 2221 misc.add_argument('--dry-run', action='store_true', 2222 help=_("Do not change the repository. Run `git fast-export` and " 2223 "filter its output, and save both the original and the " 2224 "filtered version for comparison. This also disables " 2225 "rewriting commit messages due to not knowing new commit " 2226 "IDs and disables filtering of some empty commits due to " 2227 "inability to query the fast-import backend." )) 2228 misc.add_argument('--debug', action='store_true', 2229 help=_("Print additional information about operations being " 2230 "performed and commands being run. When used together " 2231 "with --dry-run, also show extra information about what " 2232 "would be run.")) 2233 # WARNING: --state-branch has some problems: 2234 # * It does not work well with manually inserted objects (user creating 2235 # Blob() or Commit() or Tag() objects and calling 2236 # RepoFilter.insert(obj) on them). 2237 # * It does not work well with multiple source or multiple target repos 2238 # * It doesn't work so well with pruning become-empty commits (though 2239 # --refs doesn't work so well with it either) 2240 # These are probably fixable, given some work (e.g. re-importing the 2241 # graph at the beginning to get the AncestryGraph right, doing our own 2242 # export of marks instead of using fast-export --export-marks, etc.), but 2243 # for now just hide the option. 2244 misc.add_argument('--state-branch', 2245 #help=_("Enable incremental filtering by saving the mapping of old " 2246 # "to new objects to the specified branch upon exit, and" 2247 # "loading that mapping from that branch (if it exists) " 2248 # "upon startup.")) 2249 help=argparse.SUPPRESS) 2250 misc.add_argument('--stdin', action='store_true', 2251 help=_("Instead of running `git fast-export` and filtering its " 2252 "output, filter the fast-export stream from stdin. The " 2253 "stdin must be in the expected input format (e.g. it needs " 2254 "to include original-oid directives).")) 2255 misc.add_argument('--quiet', action='store_true', 2256 help=_("Pass --quiet to other git commands called")) 2257 return parser 2258 2259 @staticmethod 2260 def sanity_check_args(args): 2261 if args.analyze and args.path_changes: 2262 raise SystemExit(_("Error: --analyze is incompatible with --path* flags; " 2263 "it's a read-only operation.")) 2264 if args.analyze and args.stdin: 2265 raise SystemExit(_("Error: --analyze is incompatible with --stdin.")) 2266 # If no path_changes are found, initialize with empty list but mark as 2267 # not inclusive so that all files match 2268 if args.path_changes == None: 2269 args.path_changes = [] 2270 args.inclusive = False 2271 else: 2272 # Similarly, if we have no filtering paths, then no path should be 2273 # filtered out. Based on how newname() works, the easiest way to 2274 # achieve that is setting args.inclusive to False. 2275 if not any(x[0] == 'filter' for x in args.path_changes): 2276 args.inclusive = False 2277 # Also check for incompatible --use-base-name and --path-rename flags. 2278 if args.use_base_name: 2279 if any(x[0] == 'rename' for x in args.path_changes): 2280 raise SystemExit(_("Error: --use-base-name and --path-rename are " 2281 "incompatible.")) 2282 # Also throw some sanity checks on git version here; 2283 # PERF: remove these checks once new enough git versions are common 2284 p = subproc.Popen('git fast-export -h'.split(), 2285 stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 2286 output = p.stdout.read() 2287 if b'--anonymize-map' not in output: # pragma: no cover 2288 global date_format_permissive 2289 date_format_permissive = False 2290 if not any(x in output for x in [b'--mark-tags',b'--[no-]mark-tags']): # pragma: no cover 2291 global write_marks 2292 write_marks = False 2293 if args.state_branch: 2294 # We need a version of git-fast-export with --mark-tags 2295 raise SystemExit(_("Error: need git >= 2.24.0")) 2296 if not any(x in output for x in [b'--reencode', b'--[no-]reencode']): # pragma: no cover 2297 if args.preserve_commit_encoding: 2298 # We need a version of git-fast-export with --reencode 2299 raise SystemExit(_("Error: need git >= 2.23.0")) 2300 else: 2301 # Set args.preserve_commit_encoding to None which we'll check for later 2302 # to avoid passing --reencode=yes to fast-export (that option was the 2303 # default prior to git-2.23) 2304 args.preserve_commit_encoding = None 2305 # If we don't have fast-exoprt --reencode, we may also be missing 2306 # diff-tree --combined-all-paths, which is even more important... 2307 p = subproc.Popen('git diff-tree -h'.split(), 2308 stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 2309 output = p.stdout.read() 2310 if b'--combined-all-paths' not in output: 2311 # We need a version of git-diff-tree with --combined-all-paths 2312 raise SystemExit(_("Error: need git >= 2.22.0")) 2313 if args.sensitive_data_removal: 2314 p = subproc.Popen('git cat-file -h'.split(), 2315 stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 2316 output = p.stdout.read() 2317 if b"--batch-command" not in output: # pragma: no cover 2318 raise SystemExit(_("Error: need git >= 2.36.0")) 2319 # End of sanity checks on git version 2320 if args.max_blob_size: 2321 suffix = args.max_blob_size[-1] 2322 if suffix not in '1234567890': 2323 mult = {'K': 1024, 'M': 1024**2, 'G': 1024**3} 2324 if suffix not in mult: 2325 raise SystemExit(_("Error: could not parse --strip-blobs-bigger-than" 2326 " argument %s") 2327 % args.max_blob_size) 2328 args.max_blob_size = int(args.max_blob_size[0:-1]) * mult[suffix] 2329 else: 2330 args.max_blob_size = int(args.max_blob_size) 2331 if args.file_info_callback and ( 2332 args.stdin or args.blob_callback or args.filename_callback): 2333 raise SystemExit(_("Error: --file-info-callback is incompatible with " 2334 "--stdin, --blob-callback,\nand --filename-callback.")) 2335 2336 @staticmethod 2337 def get_replace_text(filename): 2338 replace_literals = [] 2339 replace_regexes = [] 2340 with open(filename, 'br') as f: 2341 for line in f: 2342 line = line.rstrip(b'\r\n') 2343 2344 # Determine the replacement 2345 replacement = FilteringOptions.default_replace_text 2346 if b'==>' in line: 2347 line, replacement = line.rsplit(b'==>', 1) 2348 2349 # See if we need to match via regex 2350 regex = None 2351 if line.startswith(b'regex:'): 2352 regex = line[6:] 2353 elif line.startswith(b'glob:'): 2354 regex = glob_to_regex(line[5:]) 2355 if regex: 2356 replace_regexes.append((re.compile(regex), replacement)) 2357 else: 2358 # Otherwise, find the literal we need to replace 2359 if line.startswith(b'literal:'): 2360 line = line[8:] 2361 if not line: 2362 continue 2363 replace_literals.append((line, replacement)) 2364 return {'literals': replace_literals, 'regexes': replace_regexes} 2365 2366 @staticmethod 2367 def get_paths_from_file(filename): 2368 new_path_changes = [] 2369 with open(filename, 'br') as f: 2370 for line in f: 2371 line = line.rstrip(b'\r\n') 2372 2373 # Skip blank lines 2374 if not line: 2375 continue 2376 # Skip comment lines 2377 if line.startswith(b'#'): 2378 continue 2379 2380 # Determine the replacement 2381 match_type, repl = 'literal', None 2382 if b'==>' in line: 2383 line, repl = line.rsplit(b'==>', 1) 2384 2385 # See if we need to match via regex 2386 match_type = 'match' # a.k.a. 'literal' 2387 if line.startswith(b'regex:'): 2388 match_type = 'regex' 2389 match = re.compile(line[6:]) 2390 elif line.startswith(b'glob:'): 2391 match_type = 'glob' 2392 match = line[5:] 2393 if repl: 2394 raise SystemExit(_("Error: In %s, 'glob:' and '==>' are incompatible (renaming globs makes no sense)" % decode(filename))) 2395 else: 2396 if line.startswith(b'literal:'): 2397 match = line[8:] 2398 else: 2399 match = line 2400 if repl is not None: 2401 if match and repl and match.endswith(b'/') != repl.endswith(b'/'): 2402 raise SystemExit(_("Error: When rename directories, if OLDNAME " 2403 "and NEW_NAME are both non-empty and either " 2404 "ends with a slash then both must.")) 2405 2406 # Record the filter or rename 2407 if repl is not None: 2408 new_path_changes.append(['rename', match_type, (match, repl)]) 2409 else: 2410 new_path_changes.append(['filter', match_type, match]) 2411 if match_type == 'glob' and not match.endswith(b'*'): 2412 extension = b'*' if match.endswith(b'/') else b'/*' 2413 new_path_changes.append(['filter', match_type, match+extension]) 2414 return new_path_changes 2415 2416 @staticmethod 2417 def default_options(): 2418 return FilteringOptions.parse_args([], error_on_empty = False) 2419 2420 @staticmethod 2421 def parse_args(input_args, error_on_empty = True): 2422 parser = FilteringOptions.create_arg_parser() 2423 if not input_args and error_on_empty: 2424 parser.print_usage() 2425 raise SystemExit(_("No arguments specified.")) 2426 args = parser.parse_args(input_args) 2427 if args.help: 2428 parser.print_help() 2429 raise SystemExit() 2430 if args.paths: 2431 raise SystemExit("Error: Option `--paths` unrecognized; did you mean --path or --paths-from-file?") 2432 if args.version: 2433 GitUtils.print_my_version() 2434 raise SystemExit() 2435 FilteringOptions.sanity_check_args(args) 2436 if args.mailmap: 2437 args.mailmap = MailmapInfo(args.mailmap) 2438 if args.replace_text: 2439 args.replace_text = FilteringOptions.get_replace_text(args.replace_text) 2440 if args.replace_message: 2441 args.replace_message = FilteringOptions.get_replace_text(args.replace_message) 2442 if args.strip_blobs_with_ids: 2443 with open(args.strip_blobs_with_ids, 'br') as f: 2444 args.strip_blobs_with_ids = set(f.read().split()) 2445 else: 2446 args.strip_blobs_with_ids = set() 2447 if (args.partial or args.refs) and not args.replace_refs: 2448 args.replace_refs = 'update-no-add' 2449 args.repack = not (args.partial or args.refs or args.no_gc) 2450 if args.refs or args.source or args.target: 2451 args.partial = True 2452 if args.partial: 2453 args.no_fetch = True 2454 if not args.refs: 2455 args.refs = ['--all'] 2456 return args 2457 2458class RepoAnalyze(object): 2459 2460 # First, several helper functions for analyze_commit() 2461 2462 @staticmethod 2463 def equiv_class(stats, filename): 2464 return stats['equivalence'].get(filename, (filename,)) 2465 2466 @staticmethod 2467 def setup_equivalence_for_rename(stats, oldname, newname): 2468 # if A is renamed to B and B is renamed to C, then the user thinks of 2469 # A, B, and C as all being different names for the same 'file'. We record 2470 # this as an equivalence class: 2471 # stats['equivalence'][name] = (A,B,C) 2472 # for name being each of A, B, and C. 2473 old_tuple = stats['equivalence'].get(oldname, ()) 2474 if newname in old_tuple: 2475 return 2476 elif old_tuple: 2477 new_tuple = tuple(list(old_tuple)+[newname]) 2478 else: 2479 new_tuple = (oldname, newname) 2480 for f in new_tuple: 2481 stats['equivalence'][f] = new_tuple 2482 2483 @staticmethod 2484 def setup_or_update_rename_history(stats, commit, oldname, newname): 2485 rename_commits = stats['rename_history'].get(oldname, set()) 2486 rename_commits.add(commit) 2487 stats['rename_history'][oldname] = rename_commits 2488 2489 @staticmethod 2490 def handle_renames(stats, commit, change_types, filenames): 2491 for index, change_type in enumerate(change_types): 2492 if change_type == ord(b'R'): 2493 oldname, newname = filenames[index], filenames[-1] 2494 RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname) 2495 RepoAnalyze.setup_or_update_rename_history(stats, commit, 2496 oldname, newname) 2497 2498 @staticmethod 2499 def handle_file(stats, graph, commit, modes, shas, filenames): 2500 mode, sha, filename = modes[-1], shas[-1], filenames[-1] 2501 2502 # Figure out kind of deletions to undo for this file, and update lists 2503 # of all-names-by-sha and all-filenames 2504 delmode = 'tree_deletions' 2505 if mode != b'040000': 2506 delmode = 'file_deletions' 2507 stats['names'][sha].add(filename) 2508 stats['allnames'].add(filename) 2509 2510 # If the file (or equivalence class of files) was recorded as deleted, 2511 # clearly it isn't anymore 2512 equiv = RepoAnalyze.equiv_class(stats, filename) 2513 for f in equiv: 2514 stats[delmode].pop(f, None) 2515 2516 # If we get a modify/add for a path that was renamed, we may need to break 2517 # the equivalence class. However, if the modify/add was on a branch that 2518 # doesn't have the rename in its history, we are still okay. 2519 need_to_break_equivalence = False 2520 if equiv[-1] != filename: 2521 for rename_commit in stats['rename_history'][filename]: 2522 if graph.is_ancestor(rename_commit, commit): 2523 need_to_break_equivalence = True 2524 2525 if need_to_break_equivalence: 2526 for f in equiv: 2527 if f in stats['equivalence']: 2528 del stats['equivalence'][f] 2529 2530 @staticmethod 2531 def analyze_commit(stats, graph, commit, parents, date, file_changes): 2532 graph.add_commit_and_parents(commit, parents) 2533 for change in file_changes: 2534 modes, shas, change_types, filenames = change 2535 if len(parents) == 1 and change_types.startswith(b'R'): 2536 change_types = b'R' # remove the rename score; we don't care 2537 if modes[-1] == b'160000': 2538 continue 2539 elif modes[-1] == b'000000': 2540 # Track when files/directories are deleted 2541 for f in RepoAnalyze.equiv_class(stats, filenames[-1]): 2542 if any(x == b'040000' for x in modes[0:-1]): 2543 stats['tree_deletions'][f] = date 2544 else: 2545 stats['file_deletions'][f] = date 2546 elif change_types.strip(b'AMT') == b'': 2547 RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) 2548 elif modes[-1] == b'040000' and change_types.strip(b'RAM') == b'': 2549 RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) 2550 elif change_types.strip(b'RAMT') == b'': 2551 RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) 2552 RepoAnalyze.handle_renames(stats, commit, change_types, filenames) 2553 else: 2554 raise SystemExit(_("Unhandled change type(s): %(change_type)s " 2555 "(in commit %(commit)s)") 2556 % ({'change_type': change_types, 'commit': commit}) 2557 ) # pragma: no cover 2558 2559 @staticmethod 2560 def gather_data(args): 2561 unpacked_size, packed_size = GitUtils.get_blob_sizes() 2562 stats = {'names': collections.defaultdict(set), 2563 'allnames' : set(), 2564 'file_deletions': {}, 2565 'tree_deletions': {}, 2566 'equivalence': {}, 2567 'rename_history': collections.defaultdict(set), 2568 'unpacked_size': unpacked_size, 2569 'packed_size': packed_size, 2570 'num_commits': 0} 2571 2572 # Setup the rev-list/diff-tree process 2573 processed_commits_msg = _("Processed %d commits") 2574 commit_parse_progress = ProgressWriter() 2575 num_commits = 0 2576 cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) + 2577 ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' + 2578 ' --date=short -M -t -c --raw --combined-all-paths') 2579 dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE) 2580 f = dtp.stdout 2581 line = f.readline() 2582 if not line: 2583 raise SystemExit(_("Nothing to analyze; repository is empty.")) 2584 cont = bool(line) 2585 graph = AncestryGraph() 2586 while cont: 2587 commit = line.rstrip() 2588 parents = f.readline().split() 2589 date = f.readline().rstrip() 2590 2591 # We expect a blank line next; if we get a non-blank line then 2592 # this commit modified no files and we need to move on to the next. 2593 # If there is no line, we've reached end-of-input. 2594 line = f.readline() 2595 if not line: 2596 cont = False 2597 line = line.rstrip() 2598 2599 # If we haven't reached end of input, and we got a blank line meaning 2600 # a commit that has modified files, then get the file changes associated 2601 # with this commit. 2602 file_changes = [] 2603 if cont and not line: 2604 cont = False 2605 for line in f: 2606 if not line.startswith(b':'): 2607 cont = True 2608 break 2609 n = 1+max(1, len(parents)) 2610 assert line.startswith(b':'*(n-1)) 2611 relevant = line[n-1:-1] 2612 splits = relevant.split(None, n) 2613 modes = splits[0:n] 2614 splits = splits[n].split(None, n) 2615 shas = splits[0:n] 2616 splits = splits[n].split(b'\t') 2617 change_types = splits[0] 2618 filenames = [PathQuoting.dequote(x) for x in splits[1:]] 2619 file_changes.append([modes, shas, change_types, filenames]) 2620 2621 # If someone is trying to analyze a subset of the history, make sure 2622 # to avoid dying on commits with parents that we haven't seen before 2623 if args.refs: 2624 graph.record_external_commits([p for p in parents 2625 if not p in graph.value]) 2626 2627 # Analyze this commit and update progress 2628 RepoAnalyze.analyze_commit(stats, graph, commit, parents, date, 2629 file_changes) 2630 num_commits += 1 2631 commit_parse_progress.show(processed_commits_msg % num_commits) 2632 2633 # Show the final commits processed message and record the number of commits 2634 commit_parse_progress.finish() 2635 stats['num_commits'] = num_commits 2636 2637 # Close the output, ensure rev-list|diff-tree pipeline completed successfully 2638 dtp.stdout.close() 2639 if dtp.wait(): 2640 raise SystemExit(_("Error: rev-list|diff-tree pipeline failed; see above.")) # pragma: no cover 2641 2642 return stats 2643 2644 @staticmethod 2645 def write_report(reportdir, stats): 2646 def datestr(datetimestr): 2647 return datetimestr if datetimestr else _('<present>').encode() 2648 2649 def dirnames(path): 2650 while True: 2651 path = os.path.dirname(path) 2652 yield path 2653 if path == b'': 2654 break 2655 2656 # Compute aggregate size information for paths, extensions, and dirs 2657 total_size = {'packed': 0, 'unpacked': 0} 2658 path_size = {'packed': collections.defaultdict(int), 2659 'unpacked': collections.defaultdict(int)} 2660 ext_size = {'packed': collections.defaultdict(int), 2661 'unpacked': collections.defaultdict(int)} 2662 dir_size = {'packed': collections.defaultdict(int), 2663 'unpacked': collections.defaultdict(int)} 2664 for sha in stats['names']: 2665 size = {'packed': stats['packed_size'][sha], 2666 'unpacked': stats['unpacked_size'][sha]} 2667 for which in ('packed', 'unpacked'): 2668 for name in stats['names'][sha]: 2669 total_size[which] += size[which] 2670 path_size[which][name] += size[which] 2671 basename, ext = os.path.splitext(name) 2672 ext_size[which][ext] += size[which] 2673 for dirname in dirnames(name): 2674 dir_size[which][dirname] += size[which] 2675 2676 # Determine if and when extensions and directories were deleted 2677 ext_deleted_data = {} 2678 for name in stats['allnames']: 2679 when = stats['file_deletions'].get(name, None) 2680 2681 # Update the extension 2682 basename, ext = os.path.splitext(name) 2683 if when is None: 2684 ext_deleted_data[ext] = None 2685 elif ext in ext_deleted_data: 2686 if ext_deleted_data[ext] is not None: 2687 ext_deleted_data[ext] = max(ext_deleted_data[ext], when) 2688 else: 2689 ext_deleted_data[ext] = when 2690 2691 dir_deleted_data = {} 2692 for name in dir_size['packed']: 2693 dir_deleted_data[name] = stats['tree_deletions'].get(name, None) 2694 2695 with open(os.path.join(reportdir, b"README"), 'bw') as f: 2696 # Give a basic overview of this file 2697 f.write(b"== %s ==\n" % _("Overall Statistics").encode()) 2698 f.write((" %s: %d\n" % (_("Number of commits"), 2699 stats['num_commits'])).encode()) 2700 f.write((" %s: %d\n" % (_("Number of filenames"), 2701 len(path_size['packed']))).encode()) 2702 f.write((" %s: %d\n" % (_("Number of directories"), 2703 len(dir_size['packed']))).encode()) 2704 f.write((" %s: %d\n" % (_("Number of file extensions"), 2705 len(ext_size['packed']))).encode()) 2706 f.write(b"\n") 2707 f.write((" %s: %d\n" % (_("Total unpacked size (bytes)"), 2708 total_size['unpacked'])).encode()) 2709 f.write((" %s: %d\n" % (_("Total packed size (bytes)"), 2710 total_size['packed'])).encode()) 2711 f.write(b"\n") 2712 2713 # Mention issues with the report 2714 f.write(("== %s ==\n" % _("Caveats")).encode()) 2715 f.write(("=== %s ===\n" % _("Sizes")).encode()) 2716 f.write(textwrap.dedent(_(""" 2717 Packed size represents what size your repository would be if no 2718 trees, commits, tags, or other metadata were included (though it may 2719 fail to represent de-duplication; see below). It also represents the 2720 current packing, which may be suboptimal if you haven't gc'ed for a 2721 while. 2722 2723 Unpacked size represents what size your repository would be if no 2724 trees, commits, tags, or other metadata were included AND if no 2725 files were packed; i.e., without delta-ing or compression. 2726 2727 Both unpacked and packed sizes can be slightly misleading. Deleting 2728 a blob from history not save as much space as the unpacked size, 2729 because it is obviously normally stored in packed form. Also, 2730 deleting a blob from history may not save as much space as its packed 2731 size either, because another blob could be stored as a delta against 2732 that blob, so when you remove one blob another blob's packed size may 2733 grow. 2734 2735 Also, the sum of the packed sizes can add up to more than the 2736 repository size; if the same contents appeared in the repository in 2737 multiple places, git will automatically de-dupe and store only one 2738 copy, while the way sizes are added in this analysis adds the size 2739 for each file path that has those contents. Further, if a file is 2740 ever reverted to a previous version's contents, the previous 2741 version's size will be counted multiple times in this analysis, even 2742 though git will only store it once. 2743 """)[1:]).encode()) 2744 f.write(b"\n") 2745 f.write(("=== %s ===\n" % _("Deletions")).encode()) 2746 f.write(textwrap.dedent(_(""" 2747 Whether a file is deleted is not a binary quality, since it can be 2748 deleted on some branches but still exist in others. Also, it might 2749 exist in an old tag, but have been deleted in versions newer than 2750 that. More thorough tracking could be done, including looking at 2751 merge commits where one side of history deleted and the other modified, 2752 in order to give a more holistic picture of deletions. However, that 2753 algorithm would not only be more complex to implement, it'd also be 2754 quite difficult to present and interpret by users. Since --analyze 2755 is just about getting a high-level rough picture of history, it instead 2756 implements the simplistic rule that is good enough for 98% of cases: 2757 A file is marked as deleted if the last commit in the fast-export 2758 stream that mentions the file lists it as deleted. 2759 This makes it dependent on topological ordering, but generally gives 2760 the "right" answer. 2761 """)[1:]).encode()) 2762 f.write(b"\n") 2763 f.write(("=== %s ===\n" % _("Renames")).encode()) 2764 f.write(textwrap.dedent(_(""" 2765 Renames share the same non-binary nature that deletions do, plus 2766 additional challenges: 2767 * If the renamed file is renamed again, instead of just two names for 2768 a path you can have three or more. 2769 * Rename pairs of the form (oldname, newname) that we consider to be 2770 different names of the "same file" might only be valid over certain 2771 commit ranges. For example, if a new commit reintroduces a file 2772 named oldname, then new versions of oldname aren't the "same file" 2773 anymore. We could try to portray this to the user, but it's easier 2774 for the user to just break the pairing and only report unbroken 2775 rename pairings to the user. 2776 * The ability for users to rename files differently in different 2777 branches means that our chains of renames will not necessarily be 2778 linear but may branch out. 2779 """)[1:]).encode()) 2780 f.write(b"\n") 2781 2782 # Equivalence classes for names, so if folks only want to keep a 2783 # certain set of paths, they know the old names they want to include 2784 # too. 2785 with open(os.path.join(reportdir, b"renames.txt"), 'bw') as f: 2786 seen = set() 2787 for pathname,equiv_group in sorted(stats['equivalence'].items(), 2788 key=lambda x:(x[1], x[0])): 2789 if equiv_group in seen: 2790 continue 2791 seen.add(equiv_group) 2792 f.write(("{} ->\n ".format(decode(equiv_group[0])) + 2793 "\n ".join(decode(x) for x in equiv_group[1:]) + 2794 "\n").encode()) 2795 2796 # List directories in reverse sorted order of unpacked size 2797 with open(os.path.join(reportdir, b"directories-deleted-sizes.txt"), 'bw') as f: 2798 msg = "=== %s ===\n" % _("Deleted directories by reverse size") 2799 f.write(msg.encode()) 2800 msg = _("Format: unpacked size, packed size, date deleted, directory name\n") 2801 f.write(msg.encode()) 2802 for dirname, size in sorted(dir_size['packed'].items(), 2803 key=lambda x:(x[1],x[0]), reverse=True): 2804 if (dir_deleted_data[dirname]): 2805 f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], 2806 size, 2807 datestr(dir_deleted_data[dirname]), 2808 dirname or _('<toplevel>').encode())) 2809 2810 with open(os.path.join(reportdir, b"directories-all-sizes.txt"), 'bw') as f: 2811 f.write(("=== %s ===\n" % _("All directories by reverse size")).encode()) 2812 msg = _("Format: unpacked size, packed size, date deleted, directory name\n") 2813 f.write(msg.encode()) 2814 for dirname, size in sorted(dir_size['packed'].items(), 2815 key=lambda x:(x[1],x[0]), reverse=True): 2816 f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], 2817 size, 2818 datestr(dir_deleted_data[dirname]), 2819 dirname or _("<toplevel>").encode())) 2820 2821 # List extensions in reverse sorted order of unpacked size 2822 with open(os.path.join(reportdir, b"extensions-deleted-sizes.txt"), 'bw') as f: 2823 msg = "=== %s ===\n" % _("Deleted extensions by reverse size") 2824 f.write(msg.encode()) 2825 msg = _("Format: unpacked size, packed size, date deleted, extension name\n") 2826 f.write(msg.encode()) 2827 for extname, size in sorted(ext_size['packed'].items(), 2828 key=lambda x:(x[1],x[0]), reverse=True): 2829 if (ext_deleted_data[extname]): 2830 f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], 2831 size, 2832 datestr(ext_deleted_data[extname]), 2833 extname or _('<no extension>').encode())) 2834 2835 with open(os.path.join(reportdir, b"extensions-all-sizes.txt"), 'bw') as f: 2836 f.write(("=== %s ===\n" % _("All extensions by reverse size")).encode()) 2837 msg = _("Format: unpacked size, packed size, date deleted, extension name\n") 2838 f.write(msg.encode()) 2839 for extname, size in sorted(ext_size['packed'].items(), 2840 key=lambda x:(x[1],x[0]), reverse=True): 2841 f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], 2842 size, 2843 datestr(ext_deleted_data[extname]), 2844 extname or _('<no extension>').encode())) 2845 2846 # List files in reverse sorted order of unpacked size 2847 with open(os.path.join(reportdir, b"path-deleted-sizes.txt"), 'bw') as f: 2848 msg = "=== %s ===\n" % _("Deleted paths by reverse accumulated size") 2849 f.write(msg.encode()) 2850 msg = _("Format: unpacked size, packed size, date deleted, path name(s)\n") 2851 f.write(msg.encode()) 2852 for pathname, size in sorted(path_size['packed'].items(), 2853 key=lambda x:(x[1],x[0]), reverse=True): 2854 when = stats['file_deletions'].get(pathname, None) 2855 if when: 2856 f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], 2857 size, 2858 datestr(when), 2859 pathname)) 2860 2861 with open(os.path.join(reportdir, b"path-all-sizes.txt"), 'bw') as f: 2862 msg = "=== %s ===\n" % _("All paths by reverse accumulated size") 2863 f.write(msg.encode()) 2864 msg = _("Format: unpacked size, packed size, date deleted, path name\n") 2865 f.write(msg.encode()) 2866 for pathname, size in sorted(path_size['packed'].items(), 2867 key=lambda x:(x[1],x[0]), reverse=True): 2868 when = stats['file_deletions'].get(pathname, None) 2869 f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], 2870 size, 2871 datestr(when), 2872 pathname)) 2873 2874 # List of filenames and sizes in descending order 2875 with open(os.path.join(reportdir, b"blob-shas-and-paths.txt"), 'bw') as f: 2876 f.write(("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")).encode()) 2877 f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n").encode()) 2878 for sha, size in sorted(stats['packed_size'].items(), 2879 key=lambda x:(x[1],x[0]), reverse=True): 2880 if sha not in stats['names']: 2881 # Some objects in the repository might not be referenced, or not 2882 # referenced by the branches/tags the user cares about; skip them. 2883 continue 2884 names_with_sha = stats['names'][sha] 2885 if len(names_with_sha) == 1: 2886 names_with_sha = names_with_sha.pop() 2887 else: 2888 names_with_sha = b'[' + b', '.join(sorted(names_with_sha)) + b']' 2889 f.write(b" %s %10d %10d %s\n" % (sha, 2890 stats['unpacked_size'][sha], 2891 size, 2892 names_with_sha)) 2893 2894 @staticmethod 2895 def run(args): 2896 if args.report_dir: 2897 reportdir = args.report_dir 2898 else: 2899 git_dir = GitUtils.determine_git_dir(b'.') 2900 2901 # Create the report directory as necessary 2902 results_tmp_dir = os.path.join(git_dir, b'filter-repo') 2903 if not os.path.isdir(results_tmp_dir): 2904 os.mkdir(results_tmp_dir) 2905 reportdir = os.path.join(results_tmp_dir, b"analysis") 2906 2907 if os.path.isdir(reportdir): 2908 if args.force: 2909 sys.stdout.write(_("Warning: Removing recursively: \"%s\"\n") % decode(reportdir)) 2910 shutil.rmtree(reportdir) 2911 else: 2912 sys.stdout.write(_("Error: dir already exists (use --force to delete): \"%s\"\n") % decode(reportdir)) 2913 sys.exit(1) 2914 2915 os.mkdir(reportdir) 2916 2917 # Gather the data we need 2918 stats = RepoAnalyze.gather_data(args) 2919 2920 # Write the reports 2921 sys.stdout.write(_("Writing reports to \"%s\"...") % decode(reportdir)) 2922 sys.stdout.flush() 2923 RepoAnalyze.write_report(reportdir, stats) 2924 sys.stdout.write(_("done.\n")) 2925 sys.stdout.write(_("README: \"%s\"\n") % decode( os.path.join(reportdir, b"README") )) 2926 2927class FileInfoValueHelper: 2928 def __init__(self, replace_text, insert_blob_func, source_working_dir): 2929 self.data = {} 2930 self._replace_text = replace_text 2931 self._insert_blob_func = insert_blob_func 2932 cmd = ['git', 'cat-file', '--batch-command'] 2933 self._cat_file_process = subproc.Popen(cmd, 2934 stdin = subprocess.PIPE, 2935 stdout = subprocess.PIPE, 2936 cwd = source_working_dir) 2937 2938 def finalize(self): 2939 self._cat_file_process.stdin.close() 2940 self._cat_file_process.wait() 2941 2942 def get_contents_by_identifier(self, blobhash): 2943 self._cat_file_process.stdin.write(b'contents '+blobhash+b'\n') 2944 self._cat_file_process.stdin.flush() 2945 line = self._cat_file_process.stdout.readline() 2946 try: 2947 (oid, oidtype, size) = line.split() 2948 except ValueError: 2949 assert(line == blobhash+b" missing\n") 2950 return None 2951 size = int(size) # Convert e.g. b'6283' to 6283 2952 assert(oidtype == b'blob') 2953 contents_plus_newline = self._cat_file_process.stdout.read(size+1) 2954 return contents_plus_newline[:-1] # return all but the newline 2955 2956 def get_size_by_identifier(self, blobhash): 2957 self._cat_file_process.stdin.write(b'info '+blobhash+b'\n') 2958 self._cat_file_process.stdin.flush() 2959 line = self._cat_file_process.stdout.readline() 2960 (oid, oidtype, size) = line.split() 2961 size = int(size) # Convert e.g. b'6283' to 6283 2962 assert(oidtype == b'blob') 2963 return size 2964 2965 def insert_file_with_contents(self, contents): 2966 blob = Blob(contents) 2967 self._insert_blob_func(blob) 2968 return blob.id 2969 2970 def is_binary(self, contents): 2971 return b"\0" in contents[0:8192] 2972 2973 def apply_replace_text(self, contents): 2974 new_contents = contents 2975 for literal, replacement in self._replace_text['literals']: 2976 new_contents = new_contents.replace(literal, replacement) 2977 for regex, replacement in self._replace_text['regexes']: 2978 new_contents = regex.sub(replacement, new_contents) 2979 return new_contents 2980 2981class LFSObjectTracker: 2982 class LFSObjs: 2983 def __init__(self): 2984 self.id_to_object_map = {} 2985 self.objects = set() 2986 2987 def __init__(self, file_info, check_sources, check_targets): 2988 self.source_objects = LFSObjectTracker.LFSObjs() 2989 self.target_objects = LFSObjectTracker.LFSObjs() 2990 self.hash_to_object_map = {} 2991 self.file_info = file_info 2992 self.check_sources = check_sources 2993 self.check_targets = check_targets 2994 self.objects_orphaned = False 2995 2996 def _get_lfs_values(self, contents): 2997 values = {} 2998 if len(contents) > 1024: 2999 return {} 3000 for line in contents.splitlines(): 3001 try: 3002 (key, value) = line.split(b' ', 1) 3003 except ValueError: 3004 return {} 3005 if not values and key != b'version': 3006 return values 3007 values[key] = value 3008 return values 3009 3010 def check_blob_data(self, contents, fast_export_id, source): 3011 if source and not self.check_sources: 3012 return 3013 mymap = self.source_objects if source else self.target_objects 3014 lfs_object_id = self._get_lfs_values(contents).get(b'oid') 3015 if lfs_object_id: 3016 mymap.id_to_object_map[fast_export_id] = lfs_object_id 3017 3018 def check_file_change_data(self, git_id, source): 3019 if source and not self.check_sources: 3020 return 3021 mymap = self.source_objects if source else self.target_objects 3022 if isinstance(git_id, int): 3023 lfs_object_id = mymap.id_to_object_map.get(git_id) 3024 if lfs_object_id: 3025 mymap.objects.add(lfs_object_id) 3026 else: 3027 if git_id in self.hash_to_object_map: 3028 mymap.objects.add(self.hash_to_object_map[git_id]) 3029 return 3030 size = self.file_info.get_size_by_identifier(git_id) 3031 if size >= 1024: 3032 return 3033 contents = self.file_info.get_contents_by_identifier(git_id) 3034 lfs_object_id = self._get_lfs_values(contents).get(b'oid') 3035 if lfs_object_id: 3036 self.hash_to_object_map[git_id] = lfs_object_id 3037 mymap.objects.add(lfs_object_id) 3038 3039 def check_output_object(self, obj): 3040 if not self.check_targets: 3041 return 3042 if type(obj) == Blob: 3043 self.check_blob_data(obj.data, obj.id, False) 3044 elif type(obj) == Commit: 3045 for change in obj.file_changes: 3046 sys.stdout.flush() 3047 if change.type != b'M' or change.mode == b'160000': 3048 continue 3049 self.check_file_change_data(change.blob_id, False) 3050 3051 def find_all_lfs_objects_in_repo(self, repo, source): 3052 if not source: 3053 self.file_info = FileInfoValueHelper(None, None, repo) 3054 p = subproc.Popen(["git", "rev-list", "--objects", "--all"], 3055 stdout=subprocess.PIPE, stderr=subprocess.PIPE, 3056 cwd=repo) 3057 for line in p.stdout.readlines(): 3058 try: 3059 (git_oid, filename) = line.split() 3060 except ValueError: 3061 # Commit and tree objects only have oid 3062 continue 3063 3064 mymap = self.source_objects if source else self.target_objects 3065 size = self.file_info.get_size_by_identifier(git_oid) 3066 if size >= 1024: 3067 continue 3068 contents = self.file_info.get_contents_by_identifier(git_oid) 3069 lfs_object_id = self._get_lfs_values(contents).get(b'oid') 3070 if lfs_object_id: 3071 mymap.objects.add(lfs_object_id) 3072 if not source: 3073 self.file_info.finalize() 3074 3075class InputFileBackup: 3076 def __init__(self, input_file, output_file): 3077 self.input_file = input_file 3078 self.output_file = output_file 3079 3080 def close(self): 3081 self.input_file.close() 3082 self.output_file.close() 3083 3084 def read(self, size): 3085 output = self.input_file.read(size) 3086 self.output_file.write(output) 3087 return output 3088 3089 def readline(self): 3090 line = self.input_file.readline() 3091 self.output_file.write(line) 3092 return line 3093 3094class DualFileWriter: 3095 def __init__(self, file1, file2): 3096 self.file1 = file1 3097 self.file2 = file2 3098 3099 def write(self, *args): 3100 self.file1.write(*args) 3101 self.file2.write(*args) 3102 3103 def flush(self): 3104 self.file1.flush() 3105 self.file2.flush() 3106 3107 def close(self): 3108 self.file1.close() 3109 self.file2.close() 3110 3111class RepoFilter(object): 3112 def __init__(self, 3113 args, 3114 filename_callback = None, 3115 message_callback = None, 3116 name_callback = None, 3117 email_callback = None, 3118 refname_callback = None, 3119 blob_callback = None, 3120 commit_callback = None, 3121 tag_callback = None, 3122 reset_callback = None, 3123 done_callback = None, 3124 file_info_callback = None): 3125 3126 self._args = args 3127 3128 # Repo we are exporting 3129 self._repo_working_dir = None 3130 3131 # Store callbacks for acting on objects printed by FastExport 3132 self._blob_callback = blob_callback 3133 self._commit_callback = commit_callback 3134 self._tag_callback = tag_callback 3135 self._reset_callback = reset_callback 3136 self._done_callback = done_callback 3137 3138 # Store callbacks for acting on slices of FastExport objects 3139 self._filename_callback = filename_callback # filenames from commits 3140 self._message_callback = message_callback # commit OR tag message 3141 self._name_callback = name_callback # author, committer, tagger 3142 self._email_callback = email_callback # author, committer, tagger 3143 self._refname_callback = refname_callback # from commit/tag/reset 3144 self._file_info_callback = file_info_callback # various file info 3145 self._handle_arg_callbacks() 3146 3147 # Helpers for callbacks 3148 self._file_info_value = None 3149 3150 # Defaults for input 3151 self._input = None 3152 self._fep = None # Fast Export Process 3153 self._fe_orig = None # Path to where original fast-export output stored 3154 self._fe_filt = None # Path to where filtered fast-export output stored 3155 self._parser = None # FastExportParser object we are working with 3156 3157 # Defaults for output 3158 self._output = None 3159 self._fip = None # Fast Import Process 3160 self._import_pipes = None 3161 self._managed_output = True 3162 3163 # A tuple of (depth, list-of-ancestors). Commits and ancestors are 3164 # identified by their id (their 'mark' in fast-export or fast-import 3165 # speak). The depth of a commit is one more than the max depth of any 3166 # of its ancestors. 3167 self._graph = AncestryGraph() 3168 # Another one, for ancestry of commits in the original repo 3169 self._orig_graph = AncestryGraph() 3170 3171 # Names of files that were tweaked in any commit; such paths could lead 3172 # to subsequent commits being empty 3173 self._files_tweaked = set() 3174 3175 # A set of commit hash pairs (oldhash, newhash) which used to be merge 3176 # commits but due to filtering were turned into non-merge commits. 3177 # The commits probably have suboptimal commit messages (e.g. "Merge branch 3178 # next into master"). 3179 self._commits_no_longer_merges = [] 3180 3181 # A dict of original_ids to new_ids; filtering commits means getting 3182 # new commit hash (sha1sums), and we record the mapping both for 3183 # diagnostic purposes and so we can rewrite commit messages. Note that 3184 # the new_id can be None rather than a commit hash if the original 3185 # commit became empty and was pruned or was otherwise dropped. 3186 self._commit_renames = {} 3187 3188 # A set of original_ids (i.e. original hashes) for which we have not yet 3189 # gotten the new hashses; the value is always the corresponding fast-export 3190 # id (i.e. commit.id) 3191 self._pending_renames = collections.OrderedDict() 3192 3193 # A dict of commit_hash[0:7] -> set(commit_hashes with that prefix). 3194 # 3195 # It's common for commit messages to refer to commits by abbreviated 3196 # commit hashes, as short as 7 characters. To facilitate translating 3197 # such short hashes, we have a mapping of prefixes to full old hashes. 3198 self._commit_short_old_hashes = collections.defaultdict(set) 3199 3200 # A set of commit hash references appearing in commit messages which 3201 # mapped to a valid commit that was removed entirely in the filtering 3202 # process. The commit message will continue to reference the 3203 # now-missing commit hash, since there was nothing to map it to. 3204 self._commits_referenced_but_removed = set() 3205 3206 # Other vars related to metadata tracking 3207 self._already_ran = False 3208 self._changed_refs = set() 3209 self._lfs_object_tracker = None 3210 3211 # Progress handling (number of commits parsed, etc.) 3212 self._progress_writer = ProgressWriter() 3213 self._num_commits = 0 3214 3215 # Size of blobs in the repo 3216 self._unpacked_size = {} 3217 3218 # Other vars 3219 self._sanity_checks_handled = False 3220 self._finalize_handled = False 3221 self._orig_refs = None 3222 self._config_settings = {} 3223 self._newnames = {} 3224 self._stash = None 3225 3226 # Cache a few message translations for performance reasons 3227 self._parsed_message = _("Parsed %d commits") 3228 3229 # Compile some regexes and cache those 3230 self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)') 3231 3232 def _handle_arg_callbacks(self): 3233 def make_callback(args, bdy): 3234 callback_globals = {g: globals()[g] for g in public_globals} 3235 callback_locals = {} 3236 if type(args) == str: 3237 args = (args, '_do_not_use_this_var = None') 3238 exec('def callback({}):\n'.format(', '.join(args))+ 3239 ' '+'\n '.join(bdy.splitlines()), callback_globals, callback_locals) 3240 return callback_locals['callback'] 3241 def handle(which, args=None): 3242 which_under = which.replace('-','_') 3243 if not args: 3244 args = which 3245 callback_field = '_{}_callback'.format(which_under) 3246 code_string = getattr(self._args, which_under+'_callback') 3247 if code_string: 3248 if os.path.exists(code_string): 3249 with open(code_string, 'r', encoding='utf-8') as f: 3250 code_string = f.read() 3251 if getattr(self, callback_field): 3252 raise SystemExit(_("Error: Cannot pass a %s_callback to RepoFilter " 3253 "AND pass --%s-callback" 3254 % (which_under, which))) 3255 if 'return ' not in code_string and \ 3256 which not in ('blob', 'commit', 'tag', 'reset'): 3257 raise SystemExit(_("Error: --%s-callback should have a return statement") 3258 % which) 3259 setattr(self, callback_field, make_callback(args, code_string)) 3260 handle('filename') 3261 handle('message') 3262 handle('name') 3263 handle('email') 3264 handle('refname') 3265 handle('blob') 3266 handle('commit') 3267 handle('tag') 3268 handle('reset') 3269 handle('file-info', ('filename', 'mode', 'blob_id', 'value')) 3270 3271 def _run_sanity_checks(self): 3272 self._sanity_checks_handled = True 3273 if not self._managed_output: 3274 if not self._args.replace_refs: 3275 # If not _managed_output we don't want to make extra changes to the 3276 # repo, so set default to no-op 'update-no-add' 3277 self._args.replace_refs = 'update-no-add' 3278 return 3279 3280 if self._args.debug: 3281 print("[DEBUG] Passed arguments:\n{}".format(self._args)) 3282 3283 # Determine basic repository information 3284 target_working_dir = self._args.target or b'.' 3285 self._orig_refs = GitUtils.get_refs(target_working_dir) 3286 is_bare = GitUtils.is_repository_bare(target_working_dir) 3287 self._config_settings = GitUtils.get_config_settings(target_working_dir) 3288 3289 # Determine if this is second or later run of filter-repo 3290 tmp_dir = self.results_tmp_dir(create_if_missing=False) 3291 ran_path = os.path.join(tmp_dir, b'already_ran') 3292 self._already_ran = os.path.isfile(ran_path) 3293 if self._already_ran: 3294 current_time = time.time() 3295 file_mod_time = os.path.getmtime(ran_path) 3296 file_age = current_time - file_mod_time 3297 if file_age > 86400: # file older than a day 3298 msg = (f"The previous run is older than a day ({decode(ran_path)} already exists).\n" 3299 f"See \"Already Ran\" section in the manual for more information.\n" 3300 f"Treat this run as a continuation of filtering in the previous run (Y/N)? ") 3301 response = input(msg) 3302 3303 if response.lower() != 'y': 3304 os.remove(ran_path) 3305 self._already_ran = False 3306 3307 # Interaction between --already-ran and --sensitive_data_removal 3308 msg = textwrap.dedent(_("""\ 3309 Error: Cannot specify --sensitive-data-removal on a follow-up invocation 3310 of git-filter-repo unless it was specified in previously runs.""")) 3311 if self._already_ran: 3312 sdr_path = os.path.join(tmp_dir, b'sensitive_data_removal') 3313 sdr_previously = os.path.isfile(sdr_path) 3314 if not sdr_previously and self._args.sensitive_data_removal: 3315 raise SystemExit(msg) 3316 # Treat this as a --sensitive-data-removal run if a previous run was, 3317 # even if it wasn't specified this time 3318 self._args.sensitive_data_removal = sdr_previously 3319 3320 # Have to check sensitive_data_removal interactions here instead of 3321 # sanity_check_args because of the above interaction with already_ran stuff 3322 if self._args.sensitive_data_removal: 3323 if self._args.stdin: 3324 msg = _("Error: sensitive data removal is incompatible with --stdin") 3325 raise SystemExit(msg) 3326 if self._args.source or self._args.target: 3327 msg = _("Error: sensitive data removal is incompatible with --source and --target") 3328 raise SystemExit(msg) 3329 3330 # Default for --replace-refs 3331 if not self._args.replace_refs: 3332 self._args.replace_refs = 'delete-no-add' 3333 if self._args.replace_refs == 'old-default': 3334 self._args.replace_refs = ('update-or-add' if self._already_ran 3335 else 'update-and-add') 3336 3337 # Do sanity checks from the correct directory 3338 if not self._args.force and not self._already_ran: 3339 cwd = os.getcwd() 3340 os.chdir(target_working_dir) 3341 RepoFilter.sanity_check(self._orig_refs, is_bare, self._config_settings) 3342 os.chdir(cwd) 3343 3344 def _setup_lfs_orphaning_checks(self): 3345 # Do a couple checks to see if we want to do lfs orphaning checks 3346 if not self._args.sensitive_data_removal: 3347 return 3348 metadata_dir = self.results_tmp_dir() 3349 lfs_objects_file = os.path.join(metadata_dir, b'original_lfs_objects') 3350 if self._already_ran: 3351 # Check if we did lfs filtering in the previous run 3352 if not os.path.isfile(lfs_objects_file): 3353 return 3354 3355 # Set up self._file_info_value so we can query git for stuff 3356 source_working_dir = self._args.source or b'.' 3357 self._file_info_value = FileInfoValueHelper(self._args.replace_text, 3358 self.insert, 3359 source_working_dir) 3360 3361 # One more check to see if we want to do lfs orphaning checks 3362 if not self._already_ran: 3363 # Check if lfs filtering is active in HEAD's .gitattributes file 3364 a = self._file_info_value.get_contents_by_identifier(b"HEAD:.gitattributes") 3365 if not a or not re.search(rb'\bfilter=lfs\b', a): 3366 return 3367 3368 # Set up the object tracker 3369 check_sources = not self._already_ran and not self._args.partial 3370 check_targets = not self._args.partial 3371 self._lfs_object_tracker = LFSObjectTracker(self._file_info_value, 3372 check_sources, 3373 check_targets) 3374 self._parser._lfs_object_tracker = self._lfs_object_tracker # kinda gross 3375 3376 # Get initial objects 3377 if self._already_ran: 3378 with open(lfs_objects_file, 'br') as f: 3379 for line in f: 3380 self._lfs_object_tracker.source_objects.objects.add(line.strip()) 3381 elif self._args.partial: 3382 source = True 3383 self._lfs_object_tracker.find_all_lfs_objects_in_repo(source_working_dir, 3384 source) 3385 3386 @staticmethod 3387 def loose_objects_are_replace_refs(git_dir, refs, num_loose_objects): 3388 replace_objects = set() 3389 for refname, rev in refs.items(): 3390 if not refname.startswith(b'refs/replace/'): 3391 continue 3392 replace_objects.add(rev) 3393 3394 validobj_re = re.compile(rb'^[0-9a-f]{40}$') 3395 object_dir=os.path.join(git_dir, b'objects') 3396 for root, dirs, files in os.walk(object_dir): 3397 for filename in files: 3398 objname = os.path.basename(root)+filename 3399 if objname not in replace_objects and validobj_re.match(objname): 3400 return False 3401 3402 return True 3403 3404 @staticmethod 3405 def sanity_check(refs, is_bare, config_settings): 3406 def abort(reason): 3407 dirname = config_settings.get(b'remote.origin.url', b'') 3408 msg = "" 3409 if dirname and os.path.isdir(dirname): 3410 msg = _("Note: when cloning local repositories, you need to pass\n" 3411 " --no-local to git clone to avoid this issue.\n") 3412 raise SystemExit( 3413 _("Aborting: Refusing to destructively overwrite repo history since\n" 3414 "this does not look like a fresh clone.\n" 3415 " (%s)\n%s" 3416 "Please operate on a fresh clone instead. If you want to proceed\n" 3417 "anyway, use --force.") % (reason, msg)) 3418 3419 # Avoid letting people running with weird setups and overwriting GIT_DIR 3420 # elsewhere 3421 git_dir = GitUtils.determine_git_dir(b'.') 3422 if is_bare and git_dir != b'.': 3423 abort(_("GIT_DIR must be .")) 3424 elif not is_bare and git_dir != b'.git': 3425 abort(_("GIT_DIR must be .git")) 3426 3427 # Check for refname collisions 3428 if config_settings.get(b'core.ignorecase', b'false') == b'true': 3429 collisions = collections.defaultdict(list) 3430 for ref in refs: 3431 collisions[ref.lower()].append(ref) 3432 msg = "" 3433 for ref in collisions: 3434 if len(collisions[ref]) >= 2: 3435 msg += " " + decode(b", ".join(collisions[ref])) + "\n" 3436 if msg: 3437 raise SystemExit( 3438 _("Aborting: Cannot rewrite history on a case insensitive\n" 3439 "filesystem since you have refs that differ in case only:\n" 3440 "%s") % msg) 3441 if config_settings.get(b'core.precomposeunicode', b'false') == b'true': 3442 import unicodedata # Mac users need to have python-3.8 3443 collisions = collections.defaultdict(list) 3444 for ref in refs: 3445 strref = decode(ref) 3446 collisions[unicodedata.normalize('NFC', strref)].append(strref) 3447 msg = "" 3448 for ref in collisions: 3449 if len(collisions[ref]) >= 2: 3450 msg += " " + ", ".join(collisions[ref]) + "\n" 3451 if msg: 3452 raise SystemExit( 3453 _("Aborting: Cannot rewrite history on a character normalizing\n" 3454 "filesystem since you have refs that differ in normalization:\n" 3455 "%s") % msg) 3456 3457 # Make sure repo is fully packed, just like a fresh clone would be. 3458 # Note that transfer.unpackLimit defaults to 100, meaning that a 3459 # repository with no packs and less than 100 objects should be considered 3460 # fully packed. 3461 output = subproc.check_output('git count-objects -v'.split()) 3462 stats = dict(x.split(b': ') for x in output.splitlines()) 3463 num_packs = int(stats[b'packs']) 3464 num_loose_objects = int(stats[b'count']) 3465 if num_packs > 1 or \ 3466 num_loose_objects >= 100 or \ 3467 (num_packs == 1 and num_loose_objects > 0 and 3468 not RepoFilter.loose_objects_are_replace_refs(git_dir, refs, 3469 num_loose_objects)): 3470 abort(_("expected freshly packed repo")) 3471 3472 # Make sure there is precisely one remote, named "origin"...or that this 3473 # is a new bare repo with no packs and no remotes 3474 output = subproc.check_output('git remote'.split()).strip() 3475 if not (output == b"origin" or (num_packs == 0 and not output)): 3476 abort(_("expected one remote, origin")) 3477 3478 # Make sure that all reflogs have precisely one entry 3479 reflog_dir=os.path.join(git_dir, b'logs') 3480 for root, dirs, files in os.walk(reflog_dir): 3481 for filename in files: 3482 pathname = os.path.join(root, filename) 3483 with open(pathname, 'br') as f: 3484 if len(f.read().splitlines()) > 1: 3485 shortpath = pathname[len(reflog_dir)+1:] 3486 abort(_("expected at most one entry in the reflog for %s") % 3487 decode(shortpath)) 3488 3489 # Make sure there are no stashed changes 3490 if b'refs/stash' in refs: 3491 abort(_("has stashed changes")) 3492 3493 # Do extra checks in non-bare repos 3494 if not is_bare: 3495 # Avoid uncommitted, unstaged, or untracked changes 3496 if subproc.call('git diff --staged --quiet'.split()): 3497 abort(_("you have uncommitted changes")) 3498 if subproc.call('git diff --quiet'.split()): 3499 abort(_("you have unstaged changes")) 3500 untracked_output = subproc.check_output('git ls-files -o'.split()) 3501 if len(untracked_output) > 0: 3502 uf = untracked_output.rstrip(b'\n').split(b'\n') 3503 # Since running git-filter-repo can result in files being written to 3504 # __pycache__ (depending on python version, env vars, etc.), let's 3505 # ignore those as far as "clean clone" is concerned. 3506 relevant_uf = [x for x in uf 3507 if not x.startswith(b'__pycache__/git_filter_repo.')] 3508 if len(relevant_uf) > 0: 3509 abort(_("you have untracked changes")) 3510 3511 # Avoid unpushed changes 3512 for refname, rev in refs.items(): 3513 if not refname.startswith(b'refs/heads/'): 3514 continue 3515 origin_ref = refname.replace(b'refs/heads/', b'refs/remotes/origin/') 3516 if origin_ref not in refs: 3517 abort(_('%s exists, but %s not found') % (decode(refname), 3518 decode(origin_ref))) 3519 if rev != refs[origin_ref]: 3520 abort(_('%s does not match %s') % (decode(refname), 3521 decode(origin_ref))) 3522 3523 # Make sure there is only one worktree 3524 output = subproc.check_output('git worktree list'.split()) 3525 if len(output.splitlines()) > 1: 3526 abort(_('you have multiple worktrees')) 3527 3528 def cleanup(self, repo, repack, reset, 3529 run_quietly=False, show_debuginfo=False): 3530 ''' cleanup repo; if repack then expire reflogs and do a gc --prune=now. 3531 if reset then do a reset --hard. Optionally also curb output if 3532 run_quietly is True, or go the opposite direction and show extra 3533 output if show_debuginfo is True. ''' 3534 assert not (run_quietly and show_debuginfo) 3535 3536 if (repack and not run_quietly and not show_debuginfo): 3537 print(_("Repacking your repo and cleaning out old unneeded objects")) 3538 quiet_flags = '--quiet' if run_quietly else '' 3539 cleanup_cmds = [] 3540 if repack: 3541 cleanup_cmds = ['git reflog expire --expire=now --all'.split(), 3542 'git gc {} --prune=now'.format(quiet_flags).split()] 3543 if reset: 3544 cleanup_cmds.insert(0, 'git reset {} --hard'.format(quiet_flags).split()) 3545 location_info = ' (in {})'.format(decode(repo)) if repo != b'.' else '' 3546 for cmd in cleanup_cmds: 3547 if show_debuginfo: 3548 print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd))) 3549 ret = subproc.call(cmd, cwd=repo) 3550 if ret != 0: 3551 raise SystemExit("fatal: running '%s' failed!" % ' '.join(cmd)) 3552 if cmd[0:3] == 'git reflog expire'.split(): 3553 self._write_stash() 3554 3555 def _get_rename(self, old_hash): 3556 # If we already know the rename, just return it 3557 new_hash = self._commit_renames.get(old_hash, None) 3558 if new_hash: 3559 return new_hash 3560 3561 # If it's not in the remaining pending renames, we don't know it 3562 if old_hash is not None and old_hash not in self._pending_renames: 3563 return None 3564 3565 # Read through the pending renames until we find it or we've read them all, 3566 # and return whatever we might find 3567 self._flush_renames(old_hash) 3568 return self._commit_renames.get(old_hash, None) 3569 3570 def _flush_renames(self, old_hash=None, limit=0): 3571 # Parse through self._pending_renames until we have read enough. We have 3572 # read enough if: 3573 # self._pending_renames is empty 3574 # old_hash != None and we found a rename for old_hash 3575 # limit > 0 and len(self._pending_renames) started less than 2*limit 3576 # limit > 0 and len(self._pending_renames) < limit 3577 if limit and len(self._pending_renames) < 2 * limit: 3578 return 3579 fi_input, fi_output = self._import_pipes 3580 while self._pending_renames: 3581 orig_hash, new_fast_export_id = self._pending_renames.popitem(last=False) 3582 new_hash = fi_output.readline().rstrip() 3583 self._commit_renames[orig_hash] = new_hash 3584 self._graph.record_hash(new_fast_export_id, new_hash) 3585 if old_hash == orig_hash: 3586 return 3587 if limit and len(self._pending_renames) < limit: 3588 return 3589 3590 def _translate_commit_hash(self, matchobj_or_oldhash): 3591 old_hash = matchobj_or_oldhash 3592 if not isinstance(matchobj_or_oldhash, bytes): 3593 old_hash = matchobj_or_oldhash.group(1) 3594 orig_len = len(old_hash) 3595 new_hash = self._get_rename(old_hash) 3596 if new_hash is None: 3597 if old_hash[0:7] not in self._commit_short_old_hashes: 3598 self._commits_referenced_but_removed.add(old_hash) 3599 return old_hash 3600 possibilities = self._commit_short_old_hashes[old_hash[0:7]] 3601 matches = [x for x in possibilities 3602 if x[0:orig_len] == old_hash] 3603 if len(matches) != 1: 3604 self._commits_referenced_but_removed.add(old_hash) 3605 return old_hash 3606 old_hash = matches[0] 3607 new_hash = self._get_rename(old_hash) 3608 3609 assert new_hash is not None 3610 return new_hash[0:orig_len] 3611 3612 def _maybe_trim_extra_parents(self, orig_parents, parents): 3613 '''Due to pruning of empty commits, some parents could be non-existent 3614 (None) or otherwise redundant. Remove the non-existent parents, and 3615 remove redundant parents ***SO LONG AS*** that doesn't transform a 3616 merge commit into a non-merge commit. 3617 3618 Returns a tuple: 3619 (parents, new_first_parent_if_would_become_non_merge)''' 3620 3621 always_prune = (self._args.prune_degenerate == 'always') 3622 3623 # Pruning of empty commits means multiple things: 3624 # * An original parent of this commit may have been pruned causing the 3625 # need to rewrite the reported parent to the nearest ancestor. We 3626 # want to know when we're dealing with such a parent. 3627 # * Further, there may be no "nearest ancestor" if the entire history 3628 # of that parent was also pruned. (Detectable by the parent being 3629 # 'None') 3630 # Remove all parents rewritten to None, and keep track of which parents 3631 # were rewritten to an ancestor. 3632 tmp = zip(parents, 3633 orig_parents, 3634 [(x in _SKIPPED_COMMITS or always_prune) for x in orig_parents]) 3635 tmp2 = [x for x in tmp if x[0] is not None] 3636 if not tmp2: 3637 # All ancestors have been pruned; we have no parents. 3638 return [], None 3639 parents, orig_parents, is_rewritten = [list(x) for x in zip(*tmp2)] 3640 3641 # We can't have redundant parents if we don't have at least 2 parents 3642 if len(parents) < 2: 3643 return parents, None 3644 3645 # Don't remove redundant parents if user doesn't want us to 3646 if self._args.prune_degenerate == 'never': 3647 return parents, None 3648 3649 # Remove duplicate parents (if both sides of history have lots of commits 3650 # which become empty due to pruning, the most recent ancestor on both 3651 # sides may be the same commit), except only remove parents that have 3652 # been rewritten due to previous empty pruning. 3653 seen = set() 3654 seen_add = seen.add 3655 # Deleting duplicate rewritten parents means keeping parents if either 3656 # they have not been seen or they are ones that have not been rewritten. 3657 parents_copy = parents 3658 uniq = [[p, orig_parents[i], is_rewritten[i]] for i, p in enumerate(parents) 3659 if not (p in seen or seen_add(p)) or not is_rewritten[i]] 3660 parents, orig_parents, is_rewritten = [list(x) for x in zip(*uniq)] 3661 if len(parents) < 2: 3662 return parents_copy, parents[0] 3663 3664 # Flatten unnecessary merges. (If one side of history is entirely 3665 # empty commits that were pruned, we may end up attempting to 3666 # merge a commit with its ancestor. Remove parents that are an 3667 # ancestor of another parent.) 3668 num_parents = len(parents) 3669 to_remove = [] 3670 for cur in range(num_parents): 3671 if not is_rewritten[cur]: 3672 continue 3673 for other in range(num_parents): 3674 if cur == other: 3675 continue 3676 if not self._graph.is_ancestor(parents[cur], parents[other]): 3677 continue 3678 # parents[cur] is an ancestor of parents[other], so parents[cur] 3679 # seems redundant. However, if it was intentionally redundant 3680 # (e.g. a no-ff merge) in the original, then we want to keep it. 3681 if not always_prune and \ 3682 self._orig_graph.is_ancestor(orig_parents[cur], 3683 orig_parents[other]): 3684 continue 3685 # Some folks want their history to have all first parents be merge 3686 # commits (except for any root commits), and always do a merge --no-ff. 3687 # For such folks, don't remove the first parent even if it's an 3688 # ancestor of other commits. 3689 if self._args.no_ff and cur == 0: 3690 continue 3691 # Okay so the cur-th parent is an ancestor of the other-th parent, 3692 # and it wasn't that way in the original repository; mark the 3693 # cur-th parent as removable. 3694 to_remove.append(cur) 3695 break # cur removed, so skip rest of others -- i.e. check cur+=1 3696 for x in reversed(to_remove): 3697 parents.pop(x) 3698 if len(parents) < 2: 3699 return parents_copy, parents[0] 3700 3701 return parents, None 3702 3703 def _prunable(self, commit, new_1st_parent, had_file_changes, orig_parents): 3704 parents = commit.parents 3705 3706 if self._args.prune_empty == 'never': 3707 return False 3708 always_prune = (self._args.prune_empty == 'always') 3709 3710 # For merge commits, unless there are prunable (redundant) parents, we 3711 # do not want to prune 3712 if len(parents) >= 2 and not new_1st_parent: 3713 return False 3714 3715 if len(parents) < 2: 3716 # Special logic for commits that started empty... 3717 if not had_file_changes and not always_prune: 3718 had_parents_pruned = (len(parents) < len(orig_parents) or 3719 (len(orig_parents) == 1 and 3720 orig_parents[0] in _SKIPPED_COMMITS)) 3721 # If the commit remains empty and had parents which were pruned, 3722 # then prune this commit; otherwise, retain it 3723 return (not commit.file_changes and had_parents_pruned) 3724 3725 # We can only get here if the commit didn't start empty, so if it's 3726 # empty now, it obviously became empty 3727 if not commit.file_changes: 3728 return True 3729 3730 # If there are no parents of this commit and we didn't match the case 3731 # above, then this commit cannot be pruned. Since we have no parent(s) 3732 # to compare to, abort now to prevent future checks from failing. 3733 if not parents: 3734 return False 3735 3736 # Similarly, we cannot handle the hard cases if we don't have a pipe 3737 # to communicate with fast-import 3738 if not self._import_pipes: 3739 return False 3740 3741 # If there have not been renames/remappings of IDs (due to insertion of 3742 # new blobs), then we can sometimes know things aren't prunable with a 3743 # simple check 3744 if not _IDS.has_renames(): 3745 # non-merge commits can only be empty if blob/file-change editing caused 3746 # all file changes in the commit to have the same file contents as 3747 # the parent. 3748 changed_files = set(change.filename for change in commit.file_changes) 3749 if len(orig_parents) < 2 and changed_files - self._files_tweaked: 3750 return False 3751 3752 # Finally, the hard case: due to either blob rewriting, or due to pruning 3753 # of empty commits wiping out the first parent history back to the merge 3754 # base, the list of file_changes we have may not actually differ from our 3755 # (new) first parent's version of the files, i.e. this would actually be 3756 # an empty commit. Check by comparing the contents of this commit to its 3757 # (remaining) parent. 3758 # 3759 # NOTE on why this works, for the case of original first parent history 3760 # having been pruned away due to being empty: 3761 # The first parent history having been pruned away due to being 3762 # empty implies the original first parent would have a tree (after 3763 # filtering) that matched the merge base's tree. Since 3764 # file_changes has the changes needed to go from what would have 3765 # been the first parent to our new commit, and what would have been 3766 # our first parent has a tree that matches the merge base, then if 3767 # the new first parent has a tree matching the versions of files in 3768 # file_changes, then this new commit is empty and thus prunable. 3769 fi_input, fi_output = self._import_pipes 3770 self._flush_renames() # Avoid fi_output having other stuff present 3771 # Optimization note: we could have two loops over file_changes, the 3772 # first doing all the self._output.write() calls, and the second doing 3773 # the rest. But I'm worried about fast-import blocking on fi_output 3774 # buffers filling up so I instead read from it as I go. 3775 for change in commit.file_changes: 3776 parent = new_1st_parent or commit.parents[0] # exists due to above checks 3777 quoted_filename = PathQuoting.enquote(change.filename) 3778 if isinstance(parent, int): 3779 self._output.write(b"ls :%d %s\n" % (parent, quoted_filename)) 3780 else: 3781 self._output.write(b"ls %s %s\n" % (parent, quoted_filename)) 3782 self._output.flush() 3783 parent_version = fi_output.readline().split() 3784 if change.type == b'D': 3785 if parent_version != [b'missing', quoted_filename]: 3786 return False 3787 else: 3788 blob_sha = change.blob_id 3789 if isinstance(change.blob_id, int): 3790 self._output.write(b"get-mark :%d\n" % change.blob_id) 3791 self._output.flush() 3792 blob_sha = fi_output.readline().rstrip() 3793 if parent_version != [change.mode, b'blob', blob_sha, quoted_filename]: 3794 return False 3795 3796 return True 3797 3798 def _record_remapping(self, commit, orig_parents): 3799 new_id = None 3800 # Record the mapping of old commit hash to new one 3801 if commit.original_id and self._import_pipes: 3802 fi_input, fi_output = self._import_pipes 3803 self._output.write(b"get-mark :%d\n" % commit.id) 3804 self._output.flush() 3805 orig_id = commit.original_id 3806 self._commit_short_old_hashes[orig_id[0:7]].add(orig_id) 3807 # Note that we have queued up an id for later reading; flush a 3808 # few of the older ones if we have too many queued up 3809 self._pending_renames[orig_id] = commit.id 3810 self._flush_renames(None, limit=40) 3811 # Also, record if this was a merge commit that turned into a non-merge 3812 # commit. 3813 if len(orig_parents) >= 2 and len(commit.parents) < 2: 3814 self._commits_no_longer_merges.append((commit.original_id, new_id)) 3815 3816 def callback_metadata(self, extra_items = dict()): 3817 return {'commit_rename_func': self._translate_commit_hash, 3818 'ancestry_graph': self._graph, 3819 'original_ancestry_graph': self._orig_graph, 3820 **extra_items} 3821 3822 def _tweak_blob(self, blob): 3823 if self._args.max_blob_size and len(blob.data) > self._args.max_blob_size: 3824 blob.skip() 3825 3826 if blob.original_id in self._args.strip_blobs_with_ids: 3827 blob.skip() 3828 3829 if ( self._args.replace_text 3830 and not self._file_info_callback 3831 # not (if blob contains zero byte in the first 8Kb, that is, if blob is binary data) 3832 and not b"\0" in blob.data[0:8192] 3833 ): 3834 for literal, replacement in self._args.replace_text['literals']: 3835 blob.data = blob.data.replace(literal, replacement) 3836 for regex, replacement in self._args.replace_text['regexes']: 3837 blob.data = regex.sub(replacement, blob.data) 3838 3839 if self._blob_callback: 3840 self._blob_callback(blob, self.callback_metadata()) 3841 3842 self._insert_into_stream(blob) 3843 3844 def _filter_files(self, commit): 3845 def filename_matches(path_expression, pathname): 3846 ''' Returns whether path_expression matches pathname or a leading 3847 directory thereof, allowing path_expression to not have a trailing 3848 slash even if it is meant to match a leading directory. ''' 3849 if path_expression == b'': 3850 return True 3851 n = len(path_expression) 3852 if (pathname.startswith(path_expression) and 3853 (path_expression[n-1:n] == b'/' or 3854 len(pathname) == n or 3855 pathname[n:n+1] == b'/')): 3856 return True 3857 return False 3858 3859 def newname(path_changes, pathname, use_base_name, filtering_is_inclusive): 3860 ''' Applies filtering and rename changes from path_changes to pathname, 3861 returning any of None (file isn't wanted), original filename (file 3862 is wanted with original name), or new filename. ''' 3863 wanted = False 3864 full_pathname = pathname 3865 if use_base_name: 3866 pathname = os.path.basename(pathname) 3867 for (mod_type, match_type, path_exp) in path_changes: 3868 if mod_type == 'filter' and not wanted: 3869 assert match_type in ('match', 'glob', 'regex') 3870 if match_type == 'match' and filename_matches(path_exp, pathname): 3871 wanted = True 3872 if match_type == 'glob' and fnmatch.fnmatch(pathname, path_exp): 3873 wanted = True 3874 if match_type == 'regex' and path_exp.search(pathname): 3875 wanted = True 3876 elif mod_type == 'rename': 3877 match, repl = path_exp 3878 assert match_type in ('match','regex') # glob was translated to regex 3879 if match_type == 'match' and filename_matches(match, full_pathname): 3880 full_pathname = full_pathname.replace(match, repl, 1) 3881 pathname = full_pathname # rename incompatible with use_base_name 3882 if match_type == 'regex': 3883 full_pathname = match.sub(repl, full_pathname) 3884 pathname = full_pathname # rename incompatible with use_base_name 3885 return full_pathname if (wanted == filtering_is_inclusive) else None 3886 3887 args = self._args 3888 new_file_changes = {} # Assumes no renames or copies, otherwise collisions 3889 for change in commit.file_changes: 3890 # NEEDSWORK: _If_ we ever want to pass `--full-tree` to fast-export and 3891 # parse that output, we'll need to modify this block; `--full-tree` 3892 # issues a deleteall directive which has no filename, and thus this 3893 # block would normally strip it. Of course, FileChange() and 3894 # _parse_optional_filechange() would need updates too. 3895 if change.type == b'DELETEALL': 3896 new_file_changes[b''] = change 3897 continue 3898 if change.filename in self._newnames: 3899 change.filename = self._newnames[change.filename] 3900 else: 3901 original_filename = change.filename 3902 change.filename = newname(args.path_changes, change.filename, 3903 args.use_base_name, args.inclusive) 3904 if self._filename_callback: 3905 change.filename = self._filename_callback(change.filename) 3906 self._newnames[original_filename] = change.filename 3907 if not change.filename: 3908 continue # Filtering criteria excluded this file; move on to next one 3909 if change.filename in new_file_changes: 3910 # Getting here means that path renaming is in effect, and caused one 3911 # path to collide with another. That's usually bad, but can be okay 3912 # under two circumstances: 3913 # 1) Sometimes people have a file named OLDFILE in old revisions of 3914 # history, and they rename to NEWFILE, and would like to rewrite 3915 # history so that all revisions refer to it as NEWFILE. As such, 3916 # we can allow a collision when (at least) one of the two paths 3917 # is a deletion. Note that if OLDFILE and NEWFILE are unrelated 3918 # this also allows the rewrite to continue, which makes sense 3919 # since OLDFILE is no longer in the way. 3920 # 2) If OLDFILE and NEWFILE are exactly equal, then writing them 3921 # both to the same location poses no problem; we only need one 3922 # file. (This could come up if someone copied a file in some 3923 # commit, then later either deleted the file or kept it exactly 3924 # in sync with the original with any changes, and then decides 3925 # they want to rewrite history to only have one of the two files) 3926 colliding_change = new_file_changes[change.filename] 3927 if change.type == b'D': 3928 # We can just throw this one away and keep the other 3929 continue 3930 elif change.type == b'M' and ( 3931 change.mode == colliding_change.mode and 3932 change.blob_id == colliding_change.blob_id): 3933 # The two are identical, so we can throw this one away and keep other 3934 continue 3935 elif new_file_changes[change.filename].type != b'D': 3936 raise SystemExit(_("File renaming caused colliding pathnames!\n") + 3937 _(" Commit: {}\n").format(commit.original_id) + 3938 _(" Filename: {}").format(change.filename)) 3939 # Strip files that are too large 3940 if self._args.max_blob_size and \ 3941 self._unpacked_size.get(change.blob_id, 0) > self._args.max_blob_size: 3942 continue 3943 if self._args.strip_blobs_with_ids and \ 3944 change.blob_id in self._args.strip_blobs_with_ids: 3945 continue 3946 # Otherwise, record the change 3947 new_file_changes[change.filename] = change 3948 commit.file_changes = [v for k,v in sorted(new_file_changes.items())] 3949 3950 def _tweak_commit(self, commit, aux_info): 3951 if self._args.replace_message: 3952 for literal, replacement in self._args.replace_message['literals']: 3953 commit.message = commit.message.replace(literal, replacement) 3954 for regex, replacement in self._args.replace_message['regexes']: 3955 commit.message = regex.sub(replacement, commit.message) 3956 if self._message_callback: 3957 commit.message = self._message_callback(commit.message) 3958 3959 # Change the commit message according to callback 3960 if not self._args.preserve_commit_hashes: 3961 commit.message = self._hash_re.sub(self._translate_commit_hash, 3962 commit.message) 3963 3964 # Change the author & committer according to mailmap rules 3965 args = self._args 3966 if args.mailmap: 3967 commit.author_name, commit.author_email = \ 3968 args.mailmap.translate(commit.author_name, commit.author_email) 3969 commit.committer_name, commit.committer_email = \ 3970 args.mailmap.translate(commit.committer_name, commit.committer_email) 3971 # Change author & committer according to callbacks 3972 if self._name_callback: 3973 commit.author_name = self._name_callback(commit.author_name) 3974 commit.committer_name = self._name_callback(commit.committer_name) 3975 if self._email_callback: 3976 commit.author_email = self._email_callback(commit.author_email) 3977 commit.committer_email = self._email_callback(commit.committer_email) 3978 3979 # Sometimes the 'branch' given is a tag; if so, rename it as requested so 3980 # we don't get any old tagnames 3981 if self._args.tag_rename: 3982 commit.branch = RepoFilter._do_tag_rename(args.tag_rename, commit.branch) 3983 if self._refname_callback: 3984 commit.branch = self._refname_callback(commit.branch) 3985 3986 # Filter or rename the list of file changes 3987 orig_file_changes = set(commit.file_changes) 3988 self._filter_files(commit) 3989 3990 # Record ancestry graph 3991 parents, orig_parents = commit.parents, aux_info['orig_parents'] 3992 if self._args.state_branch: 3993 external_parents = parents 3994 else: 3995 external_parents = [p for p in parents if not isinstance(p, int)] 3996 # The use of 'reversed' is intentional here; there is a risk that we have 3997 # duplicates in parents, and we want to map from parents to the first 3998 # entry we find in orig_parents in such cases. 3999 parent_reverse_dict = dict(zip(reversed(parents), reversed(orig_parents))) 4000 4001 self._graph.record_external_commits(external_parents) 4002 self._orig_graph.record_external_commits(external_parents) 4003 self._graph.add_commit_and_parents(commit.id, parents) # new githash unknown 4004 self._orig_graph.add_commit_and_parents(commit.old_id, orig_parents, 4005 commit.original_id) 4006 4007 # Prune parents (due to pruning of empty commits) if relevant, note that 4008 # new_1st_parent is None unless this was a merge commit that is becoming 4009 # a non-merge 4010 prev_1st_parent = parents[0] if parents else None 4011 parents, new_1st_parent = self._maybe_trim_extra_parents(orig_parents, 4012 parents) 4013 commit.parents = parents 4014 4015 # If parents were pruned, then we need our file changes to be relative 4016 # to the new first parent 4017 # 4018 # Notes: 4019 # * new_1st_parent and new_1st_parent != parents[0] uniquely happens for example when: 4020 # working on merge, selecting subset of files and merge base still 4021 # valid while first parent history doesn't touch any of those paths, 4022 # but second parent history does. prev_1st_parent had already been 4023 # rewritten to the non-None first ancestor and it remains valid. 4024 # self._maybe_trim_extra_parents() avoids removing this first parent 4025 # because it'd make the commit a non-merge. However, if there are 4026 # no file_changes of note, we'll drop this commit and mark 4027 # new_1st_parent as the new replacement. To correctly determine if 4028 # there are no file_changes of note, we need to have the list of 4029 # file_changes relative to new_1st_parent. 4030 # (See t9390#3, "basic -> basic-ten using '--path ten'") 4031 # * prev_1st_parent != parents[0] happens for example when: 4032 # similar to above, but the merge base is no longer valid and was 4033 # pruned away as well. Then parents started as e.g. [None, $num], 4034 # and both prev_1st_parent and new_1st_parent are None, while parents 4035 # after self._maybe_trim_extra_parents() becomes just [$num]. 4036 # (See t9390#67, "degenerate merge with non-matching filename".) 4037 # Since $num was originally a second parent, we need to rewrite 4038 # file changes to be relative to parents[0]. 4039 # * TODO: We should be getting the changes relative to the new first 4040 # parent even if self._fep is None, BUT we can't. Our method of 4041 # getting the changes right now is an external git diff invocation, 4042 # which we can't do if we just have a fast export stream. We can't 4043 # really work around it by querying the fast-import stream either, 4044 # because the 'ls' directive only allows us to list info about 4045 # specific paths, but we need to find out which paths exist in two 4046 # commits and then query them. We could maybe force checkpointing in 4047 # fast-import, then doing a diff from what'll be the new first parent 4048 # back to prev_1st_parent (which may be None, i.e. empty tree), using 4049 # the fact that in A->{B,C}->D, where D is merge of B & C, the diff 4050 # from C->D == C->A + A->B + B->D, and in these cases A==B, so it 4051 # simplifies to C->D == C->A + B->D, and C is our new 1st parent 4052 # commit, A is prev_1st_commit, and B->D is commit.file_changes that 4053 # we already have. However, checkpointing the fast-import process 4054 # and figuring out how long to wait before we can run our diff just 4055 # seems excessive. For now, just punt and assume the merge wasn't 4056 # "evil" (i.e. that it's remerge-diff is empty, as is true for most 4057 # merges). If the merge isn't evil, no further steps are necessary. 4058 if parents and self._fep and ( 4059 prev_1st_parent != parents[0] or 4060 new_1st_parent and new_1st_parent != parents[0]): 4061 # Get the id from the original fast export stream corresponding to the 4062 # new 1st parent. As noted above, that new 1st parent might be 4063 # new_1st_parent, or if that is None, it'll be parents[0]. 4064 will_be_1st = new_1st_parent or parents[0] 4065 old_id = parent_reverse_dict[will_be_1st] 4066 # Now, translate that to a hash 4067 will_be_1st_commit_hash = self._orig_graph.map_to_hash(old_id) 4068 # Get the changes from what is going to be the new 1st parent to this 4069 # merge commit. Note that since we are going from the new 1st parent 4070 # to the merge commit, we can just replace the existing 4071 # commit.file_changes rather than getting something we need to combine 4072 # with the existing commit.file_changes. Also, we can just replace 4073 # because prev_1st_parent is an ancestor of will_be_1st_commit_hash 4074 # (or prev_1st_parent is None and first parent history is gone), so 4075 # even if we retain prev_1st_parent and do not prune it, the changes 4076 # will still work given the snapshot-based way fast-export/fast-import 4077 # work. 4078 commit.file_changes = GitUtils.get_file_changes(self._repo_working_dir, 4079 will_be_1st_commit_hash, 4080 commit.original_id) 4081 4082 # Save these and filter them 4083 orig_file_changes = set(commit.file_changes) 4084 self._filter_files(commit) 4085 4086 # Process the --file-info-callback 4087 if self._file_info_callback: 4088 if self._file_info_value is None: 4089 source_working_dir = self._args.source or b'.' 4090 self._file_info_value = FileInfoValueHelper(self._args.replace_text, 4091 self.insert, 4092 source_working_dir) 4093 new_file_changes = [] 4094 for change in commit.file_changes: 4095 if change.type != b'D': 4096 assert(change.type == b'M') 4097 (filename, mode, blob_id) = \ 4098 self._file_info_callback(change.filename, 4099 change.mode, 4100 change.blob_id, 4101 self._file_info_value) 4102 if mode is None: 4103 # TODO: Should deletion of the file even be a feature? Might 4104 # want to remove this branch of the if-elif-else. 4105 assert(filename is not None) 4106 assert(blob_id is not None) 4107 new_change = FileChange(b'D', filename) 4108 elif filename is None: 4109 continue # Drop the FileChange from this commit 4110 else: 4111 new_change = FileChange(b'M', filename, blob_id, mode) 4112 else: 4113 new_change = change # use change as-is for deletions 4114 new_file_changes.append(new_change) 4115 commit.file_changes = new_file_changes 4116 4117 # Call the user-defined callback, if any 4118 if self._commit_callback: 4119 self._commit_callback(commit, self.callback_metadata(aux_info)) 4120 4121 # Find out which files were modified by the callbacks. Such paths could 4122 # lead to subsequent commits being empty (e.g. if removing a line containing 4123 # a password from every version of a file that had the password, and some 4124 # later commit did nothing more than remove that line) 4125 final_file_changes = set(commit.file_changes) 4126 if self._args.replace_text or self._blob_callback: 4127 differences = orig_file_changes.union(final_file_changes) 4128 else: 4129 differences = orig_file_changes.symmetric_difference(final_file_changes) 4130 self._files_tweaked.update(x.filename for x in differences) 4131 4132 # Now print the resulting commit, or if prunable skip it 4133 if not commit.dumped: 4134 if not self._prunable(commit, new_1st_parent, 4135 aux_info['had_file_changes'], orig_parents): 4136 self._insert_into_stream(commit) 4137 self._record_remapping(commit, orig_parents) 4138 else: 4139 rewrite_to = new_1st_parent or commit.first_parent() 4140 commit.skip(new_id = rewrite_to) 4141 if self._args.state_branch: 4142 alias = Alias(commit.old_id or commit.id, rewrite_to or deleted_hash) 4143 self._insert_into_stream(alias) 4144 if commit.branch.startswith(b'refs/') or commit.branch == b'HEAD': 4145 # The special check above is because when direct revisions are passed 4146 # along to fast-export (such as with stashes), there is a chance the 4147 # revision is rewritten to nothing. In such cases, we don't want to 4148 # point an invalid ref that just names a revision to some other point. 4149 reset = Reset(commit.branch, rewrite_to or deleted_hash) 4150 self._insert_into_stream(reset) 4151 self._commit_renames[commit.original_id] = None 4152 4153 # Show progress 4154 self._num_commits += 1 4155 if not self._args.quiet: 4156 self._progress_writer.show(self._parsed_message % self._num_commits) 4157 4158 @staticmethod 4159 def _do_tag_rename(rename_pair, tagname): 4160 old, new = rename_pair.split(b':', 1) 4161 old, new = b'refs/tags/'+old, b'refs/tags/'+new 4162 if tagname.startswith(old): 4163 return tagname.replace(old, new, 1) 4164 return tagname 4165 4166 def _tweak_tag(self, tag): 4167 # Tweak the tag message according to callbacks 4168 if self._args.replace_message: 4169 for literal, replacement in self._args.replace_message['literals']: 4170 tag.message = tag.message.replace(literal, replacement) 4171 for regex, replacement in self._args.replace_message['regexes']: 4172 tag.message = regex.sub(replacement, tag.message) 4173 if self._message_callback: 4174 tag.message = self._message_callback(tag.message) 4175 4176 # Tweak the tag name according to tag-name-related callbacks 4177 tag_prefix = b'refs/tags/' 4178 fullref = tag_prefix+tag.ref 4179 if self._args.tag_rename: 4180 fullref = RepoFilter._do_tag_rename(self._args.tag_rename, fullref) 4181 if self._refname_callback: 4182 fullref = self._refname_callback(fullref) 4183 if not fullref.startswith(tag_prefix): 4184 msg = "Error: fast-import requires tags to be in refs/tags/ namespace." 4185 msg += "\n {} renamed to {}".format(tag_prefix+tag.ref, fullref) 4186 raise SystemExit(msg) 4187 tag.ref = fullref[len(tag_prefix):] 4188 4189 # Tweak the tagger according to callbacks 4190 if self._args.mailmap: 4191 tag.tagger_name, tag.tagger_email = \ 4192 self._args.mailmap.translate(tag.tagger_name, tag.tagger_email) 4193 if self._name_callback: 4194 tag.tagger_name = self._name_callback(tag.tagger_name) 4195 if self._email_callback: 4196 tag.tagger_email = self._email_callback(tag.tagger_email) 4197 4198 # Call general purpose tag callback 4199 if self._tag_callback: 4200 self._tag_callback(tag, self.callback_metadata()) 4201 4202 def _tweak_reset(self, reset): 4203 if self._args.tag_rename: 4204 reset.ref = RepoFilter._do_tag_rename(self._args.tag_rename, reset.ref) 4205 if self._refname_callback: 4206 reset.ref = self._refname_callback(reset.ref) 4207 if self._reset_callback: 4208 self._reset_callback(reset, self.callback_metadata()) 4209 4210 def results_tmp_dir(self, create_if_missing=True): 4211 target_working_dir = self._args.target or b'.' 4212 git_dir = GitUtils.determine_git_dir(target_working_dir) 4213 d = os.path.join(git_dir, b'filter-repo') 4214 if create_if_missing and not os.path.isdir(d): 4215 os.mkdir(d) 4216 return d 4217 4218 def _load_marks_file(self, marks_basename): 4219 full_branch = 'refs/heads/{}'.format(self._args.state_branch) 4220 marks_file = os.path.join(self.results_tmp_dir(), marks_basename) 4221 working_dir = self._args.target or b'.' 4222 cmd = ['git', '-C', working_dir, 'show-ref', full_branch] 4223 contents = b'' 4224 if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0: 4225 cmd = ['git', '-C', working_dir, 'show', 4226 '%s:%s' % (full_branch, decode(marks_basename))] 4227 try: 4228 contents = subproc.check_output(cmd) 4229 except subprocess.CalledProcessError as e: # pragma: no cover 4230 raise SystemExit(_("Failed loading %s from %s") % 4231 (decode(marks_basename), full_branch)) 4232 if contents: 4233 biggest_id = max(int(x.split()[0][1:]) for x in contents.splitlines()) 4234 _IDS._next_id = max(_IDS._next_id, biggest_id+1) 4235 with open(marks_file, 'bw') as f: 4236 f.write(contents) 4237 return marks_file 4238 4239 def _save_marks_files(self): 4240 basenames = [b'source-marks', b'target-marks'] 4241 working_dir = self._args.target or b'.' 4242 4243 # Check whether the branch exists 4244 parent = [] 4245 full_branch = 'refs/heads/{}'.format(self._args.state_branch) 4246 cmd = ['git', '-C', working_dir, 'show-ref', full_branch] 4247 if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0: 4248 parent = ['-p', full_branch] 4249 4250 # Run 'git hash-object $MARKS_FILE' for each marks file, save result 4251 blob_hashes = {} 4252 for marks_basename in basenames: 4253 marks_file = os.path.join(self.results_tmp_dir(), marks_basename) 4254 if not os.path.isfile(marks_file): # pragma: no cover 4255 raise SystemExit(_("Failed to find %s to save to %s") 4256 % (marks_file, self._args.state_branch)) 4257 cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file] 4258 blob_hashes[marks_basename] = subproc.check_output(cmd).strip() 4259 4260 # Run 'git mktree' to create a tree out of it 4261 p = subproc.Popen(['git', '-C', working_dir, 'mktree'], 4262 stdin=subprocess.PIPE, stdout=subprocess.PIPE) 4263 for b in basenames: 4264 p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b)) 4265 p.stdin.close() 4266 p.wait() 4267 tree = p.stdout.read().strip() 4268 4269 # Create the new commit 4270 cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files', 4271 tree] + parent) 4272 commit = subproc.check_output(cmd).strip() 4273 subproc.call(['git', '-C', working_dir, 'update-ref', full_branch, commit]) 4274 4275 def importer_only(self): 4276 self._run_sanity_checks() 4277 self._setup_output() 4278 4279 def set_output(self, outputRepoFilter): 4280 assert outputRepoFilter._output 4281 4282 # set_output implies this RepoFilter is doing exporting, though may not 4283 # be the only one. 4284 self._setup_input(use_done_feature = False) 4285 4286 # Set our output management up to pipe to outputRepoFilter's locations 4287 self._managed_output = False 4288 self._output = outputRepoFilter._output 4289 self._import_pipes = outputRepoFilter._import_pipes 4290 4291 # Handle sanity checks, though currently none needed for export-only cases 4292 self._run_sanity_checks() 4293 4294 def _read_stash(self): 4295 if self._stash: 4296 return 4297 if self._orig_refs and b'refs/stash' in self._orig_refs and \ 4298 self._args.refs == ['--all']: 4299 repo_working_dir = self._args.source or b'.' 4300 git_dir = GitUtils.determine_git_dir(repo_working_dir) 4301 stash = os.path.join(git_dir, b'logs', b'refs', b'stash') 4302 if os.path.exists(stash): 4303 self._stash = [] 4304 with open(stash, 'br') as f: 4305 for line in f: 4306 (oldhash, newhash, rest) = line.split(None, 2) 4307 self._stash.append((newhash, rest)) 4308 self._args.refs.extend([x[0] for x in self._stash]) 4309 4310 def _write_stash(self): 4311 last = deleted_hash 4312 if self._stash: 4313 target_working_dir = self._args.target or b'.' 4314 git_dir = GitUtils.determine_git_dir(target_working_dir) 4315 stash = os.path.join(git_dir, b'logs', b'refs', b'stash') 4316 with open(stash, 'bw') as f: 4317 for (hash, rest) in self._stash: 4318 new_hash = self._get_rename(hash) 4319 if new_hash is None: 4320 continue 4321 f.write(b' '.join([last, new_hash, rest]) + b'\n') 4322 last = new_hash 4323 print(_("Rewrote the stash.")) 4324 4325 def _setup_input(self, use_done_feature): 4326 if self._args.stdin: 4327 self._input = sys.stdin.detach() 4328 sys.stdin = None # Make sure no one tries to accidentally use it 4329 self._fe_orig = None 4330 else: 4331 self._read_stash() 4332 skip_blobs = (self._blob_callback is None and 4333 (self._args.replace_text is None or 4334 self._file_info_callback is not None) and 4335 self._args.source == self._args.target) 4336 extra_flags = [] 4337 if skip_blobs: 4338 extra_flags.append('--no-data') 4339 if self._args.max_blob_size: 4340 self._unpacked_size, packed_size = GitUtils.get_blob_sizes() 4341 if use_done_feature: 4342 extra_flags.append('--use-done-feature') 4343 if write_marks: 4344 extra_flags.append(b'--mark-tags') 4345 if self._args.state_branch: 4346 assert(write_marks) 4347 source_marks_file = self._load_marks_file(b'source-marks') 4348 extra_flags.extend([b'--export-marks='+source_marks_file, 4349 b'--import-marks='+source_marks_file]) 4350 if self._args.preserve_commit_encoding is not None: # pragma: no cover 4351 reencode = 'no' if self._args.preserve_commit_encoding else 'yes' 4352 extra_flags.append('--reencode='+reencode) 4353 if self._args.date_order: 4354 extra_flags.append('--date-order') 4355 location = ['-C', self._args.source] if self._args.source else [] 4356 fep_cmd = ['git'] + location + ['fast-export', '--show-original-ids', 4357 '--signed-tags=strip', '--tag-of-filtered-object=rewrite', 4358 '--fake-missing-tagger', '--reference-excluded-parents' 4359 ] + extra_flags + self._args.refs 4360 self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE) 4361 self._input = self._fep.stdout 4362 if self._args.dry_run or self._args.debug: 4363 self._fe_orig = os.path.join(self.results_tmp_dir(), 4364 b'fast-export.original') 4365 output = open(self._fe_orig, 'bw') 4366 self._input = InputFileBackup(self._input, output) 4367 if self._args.debug: 4368 tmp = [decode(x) if isinstance(x, bytes) else x for x in fep_cmd] 4369 print("[DEBUG] Running: {}".format(' '.join(tmp))) 4370 print(" (saving a copy of the output at {})" 4371 .format(decode(self._fe_orig))) 4372 4373 def _setup_output(self): 4374 if not self._args.dry_run: 4375 location = ['-C', self._args.target] if self._args.target else [] 4376 fip_cmd = ['git'] + location + ['-c', 'core.ignorecase=false', 4377 'fast-import', '--force', '--quiet'] 4378 if date_format_permissive: 4379 fip_cmd.append('--date-format=raw-permissive') 4380 if self._args.state_branch: 4381 target_marks_file = self._load_marks_file(b'target-marks') 4382 fip_cmd.extend([b'--export-marks='+target_marks_file, 4383 b'--import-marks='+target_marks_file]) 4384 self._fip = subproc.Popen(fip_cmd, bufsize=-1, 4385 stdin=subprocess.PIPE, stdout=subprocess.PIPE) 4386 self._import_pipes = (self._fip.stdin, self._fip.stdout) 4387 if self._args.dry_run or self._args.debug: 4388 self._fe_filt = os.path.join(self.results_tmp_dir(), 4389 b'fast-export.filtered') 4390 self._output = open(self._fe_filt, 'bw') 4391 else: 4392 self._output = self._fip.stdin 4393 if self._args.debug and not self._args.dry_run: 4394 self._output = DualFileWriter(self._fip.stdin, self._output) 4395 tmp = [decode(x) if isinstance(x, bytes) else x for x in fip_cmd] 4396 print("[DEBUG] Running: {}".format(' '.join(tmp))) 4397 print(" (using the following file as input: {})" 4398 .format(decode(self._fe_filt))) 4399 4400 def _migrate_origin_to_heads(self): 4401 source_working_dir = self._args.source or b'.' 4402 target_working_dir = self._args.target or b'.' 4403 refs_to_migrate = set(x for x in self._orig_refs 4404 if x.startswith(b'refs/remotes/origin/')) 4405 refs_to_warn_about = set() 4406 if refs_to_migrate: 4407 if self._args.debug: 4408 print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*") 4409 p = subproc.Popen('git update-ref --no-deref --stdin'.split(), 4410 stdin=subprocess.PIPE, cwd=source_working_dir) 4411 for ref in refs_to_migrate: 4412 if ref == b'refs/remotes/origin/HEAD': 4413 p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref])) 4414 del self._orig_refs[ref] 4415 continue 4416 newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/') 4417 if newref not in self._orig_refs: 4418 p.stdin.write(b'create %s %s\n' % (newref, self._orig_refs[ref])) 4419 self._orig_refs[newref] = self._orig_refs[ref] 4420 elif self._orig_refs[ref] != self._orig_refs[newref]: 4421 refs_to_warn_about.add(newref) 4422 p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref])) 4423 del self._orig_refs[ref] 4424 p.stdin.close() 4425 if p.wait(): # pragma: no cover 4426 msg = _("git update-ref failed; see above") 4427 raise SystemExit(msg) 4428 4429 if b'remote.origin.url' not in self._config_settings: 4430 return 4431 4432 # For sensitive data removals, fetch ALL refs. Non-mirror clones normally 4433 # only grab branches and tags, but other refs may hold on to the sensitive 4434 # data as well. 4435 if self._args.sensitive_data_removal and \ 4436 not self._args.no_fetch and \ 4437 not self._already_ran and \ 4438 self._config_settings.get(b'remote.origin.mirror', b'false') != b'true': 4439 4440 if refs_to_warn_about: 4441 msg = ("Warning: You have refs modified from upstream:\n " + 4442 "\n ".join([decode(x) for x in refs_to_warn_about]) + 4443 "\n" + 4444 " We want to forcibly fetch from upstream to ensure\n" + 4445 " that all relevent refs are rewritten, but this will\n" + 4446 " discard your local changes before starting the\n" + 4447 " rewrite. Proceed with fetch (Y/N)?") 4448 response = input(msg) 4449 4450 if response.lower() != 'y': 4451 self._args.no_fetch = True 4452 # Don't do the fetch, and don't remove the origin remote 4453 return 4454 4455 cmd = 'git fetch -q --prune --update-head-ok --refmap "" origin +refs/*:refs/*' 4456 m = _("NOTICE: Fetching all refs from origin to make sure we rewrite\n" 4457 " all history that may reference the sensitive data, via\n" 4458 " "+cmd) 4459 print(m) 4460 ret = subproc.call([arg if arg != '""' else '' for arg in cmd.split()], 4461 cwd=source_working_dir) 4462 if ret != 0: # pragma: no cover 4463 m = _("Warning: Fetching all refs from origin failed") 4464 print(m) 4465 if self._args.sensitive_data_removal: 4466 return 4467 4468 # Now remove the origin remote 4469 url = self._config_settings[b'remote.origin.url'].decode(errors='replace') 4470 m = _("NOTICE: Removing 'origin' remote; see 'Why is my origin removed?'\n" 4471 " in the manual if you want to push back there.\n" 4472 " (was %s)") % url 4473 print(m) 4474 subproc.call('git remote rm origin'.split(), cwd=target_working_dir) 4475 4476 def _final_commands(self): 4477 self._finalize_handled = True 4478 self._done_callback and self._done_callback() 4479 4480 if self._file_info_value: 4481 self._file_info_value.finalize() 4482 if not self._args.quiet: 4483 self._progress_writer.finish() 4484 4485 def _ref_update(self, target_working_dir): 4486 # Start the update-ref process 4487 p = subproc.Popen('git update-ref --no-deref --stdin'.split(), 4488 stdin=subprocess.PIPE, 4489 cwd=target_working_dir) 4490 4491 # Remove replace_refs from _orig_refs 4492 replace_refs = {k:v for k, v in self._orig_refs.items() 4493 if k.startswith(b'refs/replace/')} 4494 reverse_replace_refs = collections.defaultdict(list) 4495 for k,v in replace_refs.items(): 4496 reverse_replace_refs[v].append(k) 4497 all(map(self._orig_refs.pop, replace_refs)) 4498 4499 # Remove unused refs 4500 exported_refs, imported_refs = self.get_exported_and_imported_refs() 4501 refs_to_nuke = exported_refs - imported_refs 4502 # Because revisions can be passed to fast-export which handles them as 4503 # though they were refs, we might have bad "refs" to nuke; strip them out. 4504 refs_to_nuke = [x for x in refs_to_nuke 4505 if x.startswith(b'refs/') or x == b'HEAD'] 4506 if self._args.partial: 4507 refs_to_nuke = set() 4508 if refs_to_nuke and self._args.debug: 4509 print("[DEBUG] Deleting the following refs:\n "+ 4510 decode(b"\n ".join(sorted(refs_to_nuke)))) 4511 p.stdin.write(b''.join([b"delete %s\n" % x 4512 for x in refs_to_nuke])) 4513 4514 # Delete or update and add replace_refs; note that fast-export automatically 4515 # handles 'update-no-add', we only need to take action for the other four 4516 # choices for replace_refs. 4517 self._flush_renames() 4518 actual_renames = {k:v for k,v in self._commit_renames.items() if k != v} 4519 if self._args.replace_refs in ['delete-no-add', 'delete-and-add']: 4520 # Delete old replace refs, if unwanted 4521 replace_refs_to_nuke = set(replace_refs) 4522 if self._args.replace_refs == 'delete-and-add': 4523 # git-update-ref won't allow us to update a ref twice, so be careful 4524 # to avoid deleting refs we'll later update 4525 replace_refs_to_nuke = replace_refs_to_nuke.difference( 4526 [b'refs/replace/'+x for x in actual_renames]) 4527 p.stdin.write(b''.join([b"delete %s\n" % x 4528 for x in replace_refs_to_nuke])) 4529 if self._args.replace_refs in ['delete-and-add', 'update-or-add', 4530 'update-and-add']: 4531 # Add new replace refs 4532 update_only = (self._args.replace_refs == 'update-or-add') 4533 p.stdin.write(b''.join([b"update refs/replace/%s %s\n" % (old, new) 4534 for old,new in actual_renames.items() 4535 if new and not (update_only and 4536 old in reverse_replace_refs)])) 4537 4538 # Complete the update-ref process 4539 p.stdin.close() 4540 if p.wait(): 4541 raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover 4542 4543 def _remap_to(self, oldish_hash): 4544 ''' 4545 Given an oldish_hash (from the beginning of the current run), return: 4546 IF oldish_hash is NOT pruned: 4547 the hash of the rewrite of oldish_hash 4548 otherwise: 4549 the hash of the rewrite of the first unpruned ancestor of oldish_hash 4550 ''' 4551 old_id = self._orig_graph._hash_to_id[oldish_hash] 4552 new_id = _IDS.translate(old_id) 4553 new_hash = self._graph.git_hash[new_id] if new_id else deleted_hash 4554 return new_hash 4555 4556 def _compute_metadata(self, metadata_dir, orig_refs): 4557 # 4558 # First, handle commit_renames 4559 # 4560 old_commit_renames = dict() 4561 if not self._already_ran: 4562 commit_renames = {old: new 4563 for old, new in self._commit_renames.items() 4564 } 4565 else: 4566 # Read commit-map into old_commit_renames 4567 with open(os.path.join(metadata_dir, b'commit-map'), 'br') as f: 4568 f.readline() # Skip the header line 4569 for line in f: 4570 (old,new) = line.split() 4571 old_commit_renames[old] = new 4572 # Use A->B mappings in old_commit_renames, and B->C mappings in 4573 # self._commit_renames to yield A->C mappings in commit_renames 4574 commit_renames = {old: self._commit_renames.get(newish, newish) 4575 for old, newish in old_commit_renames.items()} 4576 # If there are any B->C mappings in self._commit_renames for which 4577 # there was no A->B mapping in old_commit_renames, then add the 4578 # B->C mapping to commit_renames too. 4579 seen = set(old_commit_renames.values()) 4580 commit_renames.update({old: new 4581 for old, new in self._commit_renames.items() 4582 if old not in seen}) 4583 4584 # 4585 # Second, handle ref_maps 4586 # 4587 exported_refs, imported_refs = self.get_exported_and_imported_refs() 4588 4589 old_commit_unrenames = dict() 4590 if not self._already_ran: 4591 old_ref_map = dict((refname, (old_hash, deleted_hash)) 4592 for refname, old_hash in orig_refs.items() 4593 if refname in exported_refs) 4594 else: 4595 # old_commit_renames talk about how commits were renamed in the original 4596 # run. Let's reverse it to find out how to get from the intermediate 4597 # commit name, back to the original. Because everything in orig_refs 4598 # right now refers to the intermediate commits after the first run(s), 4599 # and we need to map them back to what they were before any changes. 4600 old_commit_unrenames = dict((v,k) for (k,v) in old_commit_renames.items()) 4601 4602 old_ref_map = {} 4603 # Populate old_ref_map from the 'ref-map' file 4604 with open(os.path.join(metadata_dir, b'ref-map'), 'br') as f: 4605 f.readline() # Skip the header line 4606 for line in f: 4607 (old,intermediate,ref) = line.split() 4608 old_ref_map[ref] = (old, intermediate) 4609 # Append to old_ref_map items from orig_refs that were exported, but 4610 # get the actual original commit name 4611 for refname, old_hash in orig_refs.items(): 4612 if refname in old_ref_map: 4613 continue 4614 if refname not in exported_refs: 4615 continue 4616 # Compute older_hash 4617 original_hash = old_commit_unrenames.get(old_hash, old_hash) 4618 old_ref_map[refname] = (original_hash, deleted_hash) 4619 4620 new_refs = {} 4621 new_refs_initialized = False 4622 ref_maps = {} 4623 self._orig_graph._ensure_reverse_maps_populated() 4624 for refname, pair in old_ref_map.items(): 4625 old_hash, hash_ref_becomes_if_not_imported_in_this_run = pair 4626 if refname not in imported_refs: 4627 new_hash = hash_ref_becomes_if_not_imported_in_this_run 4628 elif old_hash in commit_renames: 4629 intermediate = old_commit_renames.get(old_hash,old_hash) 4630 if intermediate in self._commit_renames: 4631 new_hash = self._remap_to(intermediate) 4632 else: 4633 new_hash = intermediate 4634 else: # Must be either an annotated tag, or a ref whose tip was pruned 4635 if not new_refs_initialized: 4636 target_working_dir = self._args.target or b'.' 4637 new_refs = GitUtils.get_refs(target_working_dir) 4638 new_refs_initialized = True 4639 if refname in new_refs: 4640 new_hash = new_refs[refname] 4641 else: 4642 new_hash = deleted_hash 4643 ref_maps[refname] = (old_hash, new_hash) 4644 if self._args.source or self._args.target: 4645 if not new_refs_initialized: 4646 target_working_dir = self._args.target or b'.' 4647 new_refs = GitUtils.get_refs(target_working_dir) 4648 new_refs_initialized = True 4649 for ref, new_hash in new_refs.items(): 4650 if ref not in orig_refs and not ref.startswith(b'refs/replace/'): 4651 old_hash = b'0'*len(new_hash) 4652 ref_maps[ref] = (old_hash, new_hash) 4653 4654 # 4655 # Third, handle first_changes 4656 # 4657 4658 old_first_changes = dict() 4659 if self._already_ran: 4660 # Read first_changes into old_first_changes 4661 with open(os.path.join(metadata_dir, b'first-changed-commits'), 'br') as f: 4662 for line in f: 4663 changed_commit, undeleted_self_or_ancestor = line.strip().split() 4664 old_first_changes[changed_commit] = undeleted_self_or_ancestor 4665 # We need to find the commits that were modified whose parents were not. 4666 # To be able to find parents, we need the commit names as of the beginning 4667 # of this run, and then when we are done, we need to map them back to the 4668 # name of the commits from before any git-filter-repo runs. 4669 # 4670 # We are excluding here any commits deleted in previous git-filter-repo 4671 # runs 4672 undo_old_commit_renames = dict((v,k) for (k,v) in old_commit_renames.items() 4673 if v != deleted_hash) 4674 # Get a list of all commits that were changed, as of the beginning of 4675 # this latest run. 4676 changed_commits = {new 4677 for (old,new) in old_commit_renames.items() 4678 if old != new and new != deleted_hash} | \ 4679 {old 4680 for (old,new) in self._commit_renames.items() 4681 if old != new} 4682 special_changed_commits = {old 4683 for (old,new) in old_commit_renames.items() 4684 if new == deleted_hash} 4685 first_changes = dict() 4686 for (old,new) in self._commit_renames.items(): 4687 if old == new: 4688 # old wasn't modified, can't be first change if not even a change 4689 continue 4690 if old_commit_unrenames.get(old,old) != old: 4691 # old was already modified in previous run; while it might represent 4692 # something that is still a first change, we'll handle that as we 4693 # loop over old_first_changes below 4694 continue 4695 if any(parent in changed_commits 4696 for parent in self._orig_graph.get_parent_hashes(old)): 4697 # a parent of old was modified, so old is not a first change 4698 continue 4699 # At this point, old IS a first change. We need to find out what new 4700 # commit it maps to, or if it doesn't map to one, what new commit was 4701 # its most recent ancestor that wasn't pruned. 4702 if new is None: 4703 new = self._remap_to(old) 4704 first_changes[old] = (new if new is not None else deleted_hash) 4705 for (old,undeleted_self_or_ancestor) in old_first_changes.items(): 4706 if undeleted_self_or_ancestor == deleted_hash: 4707 # old represents a commit that was pruned and whose entire ancestry 4708 # was pruned. So, old is still a first change 4709 first_changes[old] = undeleted_self_or_ancestor 4710 continue 4711 intermediate = old_commit_renames.get(old, old) 4712 usoa = undeleted_self_or_ancestor 4713 new_ancestor = self._commit_renames.get(usoa, usoa) 4714 if intermediate == deleted_hash: 4715 # old was pruned in previous rewrite 4716 if usoa != new_ancestor: 4717 # old's ancestor got rewritten in this filtering run; we can drop 4718 # this one from first_changes. 4719 continue 4720 # Getting here means old was a first change and old was pruned in a 4721 # previous run, and its ancestors that survived were non rewritten in 4722 # this run, so old remains a first change 4723 first_changes[old] = new_ancestor # or usoa, since new_ancestor == usoa 4724 continue 4725 assert(usoa == intermediate) # old wasn't pruned => usoa == intermediate 4726 4727 # Check whether parents of intermediate were rewritten. Note that 4728 # intermediate in self._commit_renames only means that intermediate was 4729 # processed by the latest filtering (not necessarily that it changed), 4730 # but we need to know that before we can check for parent hashes having 4731 # changed. 4732 if intermediate not in self._commit_renames: 4733 # This commit was not processed by this run, so it remains a first 4734 # change 4735 first_changes[old] = usoa 4736 continue 4737 if any(parent in changed_commits 4738 for parent in self._orig_graph.get_parent_hashes(intermediate)): 4739 # An ancestor was modified by this run, so it is no longer a first 4740 # change; continue to the next one. 4741 continue 4742 # This change is a first_change; find the new commit its usoa maps to 4743 new = self._remap_to(intermediate) 4744 assert(new is not None) 4745 first_changes[old] = new 4746 4747 return commit_renames, ref_maps, first_changes 4748 4749 def _handle_lfs_metadata(self, metadata_dir): 4750 if self._lfs_object_tracker is None: 4751 print("NOTE: LFS object orphaning not checked (LFS not in use)") 4752 return 4753 4754 if self._args.partial: 4755 target_working_dir = self._args.target or b'.' 4756 source = False 4757 self._lfs_object_tracker.find_all_lfs_objects_in_repo(target_working_dir, 4758 source) 4759 4760 with open(os.path.join(metadata_dir, b'original_lfs_objects'), 'bw') as f: 4761 for obj in sorted(self._lfs_object_tracker.source_objects.objects): 4762 f.write(obj+b"\n") 4763 4764 orphaned_lfs_path = os.path.join(metadata_dir, b'orphaned_lfs_objects') 4765 msg = textwrap.dedent(_(f"""\ 4766 NOTE: There were LFS Objects Orphaned by this rewrite recorded in 4767 {decode(orphaned_lfs_path)}.""")) 4768 with open(orphaned_lfs_path, 'bw') as f: 4769 differences = self._lfs_object_tracker.source_objects.objects - \ 4770 self._lfs_object_tracker.target_objects.objects 4771 for obj in sorted(differences): 4772 f.write(obj+b"\n") 4773 if differences: 4774 self._lfs_object_tracker.objects_orphaned = True 4775 print(msg) 4776 4777 def _record_metadata(self, metadata_dir, orig_refs): 4778 self._flush_renames() 4779 commit_renames, ref_maps, first_changes = \ 4780 self._compute_metadata(metadata_dir, orig_refs) 4781 4782 if self._args.sensitive_data_removal: 4783 changed_commits = sum(k!=v for (k,v) in commit_renames.items()) 4784 print(f"You rewrote {changed_commits} (of {len(commit_renames)}) commits.") 4785 print("") # Add a blank line before important rewrite information 4786 print(f"NOTE: First Changed Commit(s) is/are:\n " 4787 + decode(b"\n ".join(x for x in first_changes))) 4788 4789 with open(os.path.join(metadata_dir, b'sensitive_data_removal'), 'bw') as f: 4790 pass # Write nothing; we only need the file created 4791 4792 self._handle_lfs_metadata(metadata_dir) 4793 print("") # Add a blank line after important rewrite information 4794 4795 with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f: 4796 f.write(("%-40s %s\n" % (_("old"), _("new"))).encode()) 4797 for (old,new) in sorted(commit_renames.items()): 4798 msg = b'%s %s\n' % (old, new if new != None else deleted_hash) 4799 f.write(msg) 4800 4801 with open(os.path.join(metadata_dir, b'ref-map'), 'bw') as f: 4802 f.write(("%-40s %-40s %s\n" % (_("old"), _("new"), _("ref"))).encode()) 4803 for refname, hash_pair in sorted(ref_maps.items()): 4804 (old_hash, new_hash) = hash_pair 4805 f.write(b'%s %s %s\n' % (old_hash, new_hash, refname)) 4806 if old_hash != new_hash: 4807 self._changed_refs.add(refname) 4808 4809 with open(os.path.join(metadata_dir, b'changed-refs'), 'bw') as f: 4810 for refname in sorted(self._changed_refs): 4811 f.write(b'%s\n' % refname) 4812 4813 with open(os.path.join(metadata_dir, b'first-changed-commits'), 'bw') as f: 4814 for commit, undeleted_self_or_ancestor in sorted(first_changes.items()): 4815 f.write(b'%s %s\n' % (commit, undeleted_self_or_ancestor)) 4816 4817 with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f: 4818 issues_found = False 4819 if self._commits_no_longer_merges: 4820 issues_found = True 4821 4822 f.write(textwrap.dedent(_(''' 4823 The following commits used to be merge commits but due to filtering 4824 are now regular commits; they likely have suboptimal commit messages 4825 (e.g. "Merge branch next into master"). Original commit hash on the 4826 left, commit hash after filtering/rewriting on the right: 4827 ''')[1:]).encode()) 4828 for oldhash, newhash in self._commits_no_longer_merges: 4829 f.write(' {} {}\n'.format(oldhash, newhash).encode()) 4830 f.write(b'\n') 4831 4832 if self._commits_referenced_but_removed: 4833 issues_found = True 4834 f.write(textwrap.dedent(_(''' 4835 The following commits were filtered out, but referenced in another 4836 commit message. The reference to the now-nonexistent commit hash 4837 (or a substring thereof) was left as-is in any commit messages: 4838 ''')[1:]).encode()) 4839 for bad_commit_reference in self._commits_referenced_but_removed: 4840 f.write(' {}\n'.format(bad_commit_reference).encode()) 4841 f.write(b'\n') 4842 4843 if not issues_found: 4844 f.write(_("No filtering problems encountered.\n").encode()) 4845 4846 with open(os.path.join(metadata_dir, b'already_ran'), 'bw') as f: 4847 f.write(_("This file exists to allow you to filter again without --force,\n" 4848 "and to specify that metadata files should be updated instead\n" 4849 "of rewritten").encode()) 4850 4851 def finish(self): 4852 ''' Alternative to run() when there is no input of our own to parse, 4853 meaning that run only really needs to close the handle to fast-import 4854 and let it finish, thus making a call to "run" feel like a misnomer. ''' 4855 assert not self._input 4856 assert self._managed_output 4857 self.run() 4858 4859 def insert(self, obj, direct_insertion = False): 4860 if not direct_insertion: 4861 if type(obj) == Blob: 4862 self._tweak_blob(obj) 4863 elif type(obj) == Commit: 4864 aux_info = {'orig_parents': obj.parents, 4865 'had_file_changes': bool(obj.file_changes)} 4866 self._tweak_commit(obj, aux_info) 4867 elif type(obj) == Reset: 4868 self._tweak_reset(obj) 4869 elif type(obj) == Tag: 4870 self._tweak_tag(obj) 4871 self._insert_into_stream(obj) 4872 4873 def _insert_into_stream(self, obj): 4874 if not obj.dumped: 4875 if self._lfs_object_tracker: 4876 self._lfs_object_tracker.check_output_object(obj) 4877 if self._parser: 4878 self._parser.insert(obj) 4879 else: 4880 obj.dump(self._output) 4881 4882 def get_exported_and_imported_refs(self): 4883 return self._parser.get_exported_and_imported_refs() 4884 4885 def run(self): 4886 start = time.time() 4887 if not self._input and not self._output: 4888 self._run_sanity_checks() 4889 if not self._args.dry_run and not self._args.partial: 4890 self._read_stash() 4891 self._migrate_origin_to_heads() 4892 self._setup_input(use_done_feature = True) 4893 self._setup_output() 4894 assert self._sanity_checks_handled 4895 4896 if self._input: 4897 # Create and run the filter 4898 self._repo_working_dir = self._args.source or b'.' 4899 self._parser = FastExportParser(blob_callback = self._tweak_blob, 4900 commit_callback = self._tweak_commit, 4901 tag_callback = self._tweak_tag, 4902 reset_callback = self._tweak_reset, 4903 done_callback = self._final_commands) 4904 self._setup_lfs_orphaning_checks() 4905 self._parser.run(self._input, self._output) 4906 if not self._finalize_handled: 4907 self._final_commands() 4908 4909 # Make sure fast-export completed successfully 4910 if not self._args.stdin and self._fep.wait(): 4911 raise SystemExit(_("Error: fast-export failed; see above.")) # pragma: no cover 4912 self._input.close() 4913 4914 # If we're not the manager of self._output, we should avoid post-run cleanup 4915 if not self._managed_output: 4916 return 4917 4918 # Close the output and ensure fast-import successfully completes 4919 self._output.close() 4920 if not self._args.dry_run and self._fip.wait(): 4921 raise SystemExit(_("Error: fast-import failed; see above.")) # pragma: no cover 4922 4923 # With fast-export and fast-import complete, update state if requested 4924 if self._args.state_branch: 4925 self._save_marks_files() 4926 4927 # Notify user how long it took, before doing a gc and such 4928 msg = "New history written in {:.2f} seconds..." 4929 if self._args.repack: 4930 msg = "New history written in {:.2f} seconds; now repacking/cleaning..." 4931 print(msg.format(time.time()-start)) 4932 4933 # Exit early, if requested 4934 if self._args.dry_run: 4935 print(_("NOTE: Not running fast-import or cleaning up; --dry-run passed.")) 4936 if self._fe_orig: 4937 print(_(" Requested filtering can be seen by comparing:")) 4938 print(" " + decode(self._fe_orig)) 4939 else: 4940 print(_(" Requested filtering can be seen at:")) 4941 print(" " + decode(self._fe_filt)) 4942 return 4943 4944 target_working_dir = self._args.target or b'.' 4945 if self._input: 4946 self._ref_update(target_working_dir) 4947 4948 # Write out data about run 4949 self._record_metadata(self.results_tmp_dir(), self._orig_refs) 4950 4951 # Final cleanup: 4952 # If we need a repack, then nuke the reflogs and repack. 4953 # If we need a reset, do a reset --hard 4954 reset = not GitUtils.is_repository_bare(target_working_dir) 4955 self.cleanup(target_working_dir, self._args.repack, reset, 4956 run_quietly=self._args.quiet, 4957 show_debuginfo=self._args.debug) 4958 4959 # Let user know how long it took 4960 print(_("Completely finished after {:.2f} seconds.") 4961 .format(time.time()-start)) 4962 4963 # Give post-rewrite instructions for cleaning up other copies for SDR 4964 if self._args.sensitive_data_removal: 4965 lfs_note = "" 4966 if self._lfs_object_tracker and \ 4967 self._lfs_object_tracker.objects_orphaned == True: 4968 lfs_note = _(" and LFS Objects Orphaned") 4969 push_command = "git push --force --mirror origin" 4970 if self._args.no_fetch: 4971 if self._args.partial: 4972 push_command = "git push --force origin " + \ 4973 " ".join(sorted([decode(x) for x in self._changed_refs])) 4974 else: 4975 push_command = "git push --all --tags origin" 4976 print("") 4977 print(sdr_next_steps % (push_command, lfs_note, lfs_note)) 4978 4979def main(): 4980 setup_gettext() 4981 args = FilteringOptions.parse_args(sys.argv[1:]) 4982 if args.analyze: 4983 RepoAnalyze.run(args) 4984 else: 4985 filter = RepoFilter(args) 4986 filter.run() 4987 4988if __name__ == '__main__': 4989 main()