# NOTE(review): stray line -- this file is git-filter-repo, not a monorepo
# management tool; this text also sits above the shebang (disabling it) and
# should almost certainly be deleted entirely.
1#!/usr/bin/env python3
2
3"""
4git-filter-repo filters git repositories, similar to git filter-branch, BFG
5repo cleaner, and others. The basic idea is that it works by running
6 git fast-export <options> | filter | git fast-import <options>
7where this program not only launches the whole pipeline but also serves as
8the 'filter' in the middle. It does a few additional things on top as well
9in order to make it into a well-rounded filtering tool.
10
11git-filter-repo can also be used as a library for more involved filtering
12operations; however:
13 ***** API BACKWARD COMPATIBILITY CAVEAT *****
14 Programs using git-filter-repo as a library can reach pretty far into its
15 internals, but I am not prepared to guarantee backward compatibility of
16 all APIs. I suspect changes will be rare, but I reserve the right to
17 change any API. Since it is assumed that repository filtering is
18 something one would do very rarely, and in particular that it's a
19 one-shot operation, this should not be a problem in practice for anyone.
20 However, if you want to re-use a program you have written that uses
21 git-filter-repo as a library (or makes use of one of its --*-callback
22 arguments), you should either make sure you are using the same version of
23 git and git-filter-repo, or make sure to re-test it.
24
25 If there are particular pieces of the API you are concerned about, and
26 there is not already a testcase for it in t9391-lib-usage.sh or
27 t9392-python-callback.sh, please contribute a testcase. That will not
28 prevent me from changing the API, but it will allow you to look at the
29 history of a testcase to see whether and how the API changed.
30 ***** END API BACKWARD COMPATIBILITY CAVEAT *****
31"""
32
33import argparse
34import collections
35import fnmatch
36import gettext
37import io
38import os
39import platform
40import re
41import shutil
42import subprocess
43import sys
44import time
45import textwrap
46
47from datetime import tzinfo, timedelta, datetime
48
__all__ = ["Blob", "Reset", "FileChange", "Commit", "Tag", "Progress",
           "Checkpoint", "FastExportParser", "ProgressWriter",
           "string_to_date", "date_to_string",
           "record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"]

# The globals to make visible to callbacks. They will see all our imports for
# free, as well as our public API.
public_globals = ["__builtins__", "argparse", "collections", "fnmatch",
                  "gettext", "io", "os", "platform", "re", "shutil",
                  "subprocess", "sys", "time", "textwrap", "tzinfo",
                  "timedelta", "datetime"] + __all__

# Forty zeros: the all-zeros object hash (per its name, denotes a deletion)
deleted_hash = b'0'*40
# Whether dump() should emit 'mark' lines for elements (see Tag.dump below)
write_marks = True
# NOTE(review): not referenced anywhere in this chunk; presumably toggles
# lenient parsing of date fields elsewhere in the file -- confirm before use.
date_format_permissive = True
64
def gettext_poison(msg):
  """Translate msg via gettext, unless git's gettext-poison test mode is on.

  In poison mode every translated string becomes a fixed marker, so tests can
  detect strings that were translated when they shouldn't be.
  """
  if "GIT_TEST_GETTEXT_POISON" not in os.environ:
    return gettext.gettext(msg)
  return "# GETTEXT POISON #" # pragma: no cover

# Conventional short alias for marking translatable strings
_ = gettext_poison
71
def setup_gettext():
  """Point gettext at git's message domain and translation directory."""
  TEXTDOMAIN = "git-filter-repo"
  podir = os.environ.get("GIT_TEXTDOMAINDIR") or "@@LOCALEDIR@@"
  if not os.path.isdir(podir): # pragma: no cover
    podir = None # Python has its own fallback; use that

  # git.git:gettext.c and git.git:perl/Git/I18n.pm do the equivalent via
  # locale.setlocale/textdomain/bindtextdomain, but the python docs suggest
  # using the gettext module (which doesn't have setlocale()) instead, so:
  gettext.textdomain(TEXTDOMAIN)
  gettext.bindtextdomain(TEXTDOMAIN, podir)
89
90def _timedelta_to_seconds(delta):
91 """
92 Converts timedelta to seconds
93 """
94 offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000
95 return round(offset)
96
class FixedTimeZone(tzinfo):
  """
  A tzinfo with a constant offset east of UTC, parsed from a git-style
  b'+HHMM'/b'-HHMM' offset string (which is kept verbatim as the zone name).
  """

  tz_re = re.compile(br'^([-+]?)(\d\d)(\d\d)$')

  def __init__(self, offset_string):
    super().__init__()
    sign, hours, minutes = FixedTimeZone.tz_re.match(offset_string).groups()
    total_minutes = 60*int(hours) + int(minutes)
    if sign == b'-':
      total_minutes = -total_minutes
    self._offset = timedelta(minutes=total_minutes)
    self._offset_string = offset_string

  def utcoffset(self, dt):
    return self._offset

  def tzname(self, dt):
    # Deliberately bytes, not str: fed directly into fast-export-style output
    return self._offset_string

  def dst(self, dt):
    return timedelta(0)
119
def string_to_date(datestring):
  """Parse b'<unix timestamp> <tz offset>' into a tz-aware datetime."""
  timestamp, tz_offset = datestring.split()
  return datetime.fromtimestamp(int(timestamp), FixedTimeZone(tz_offset))
124
def date_to_string(dateobj):
  """Render a tz-aware datetime back to b'<unix timestamp> <tz offset>'."""
  epoch = datetime.fromtimestamp(0, dateobj.tzinfo)
  seconds = int(_timedelta_to_seconds(dateobj - epoch))
  return b'%d %s' % (seconds, dateobj.tzinfo.tzname(0))
129
def decode(bytestr):
  """Best-effort decode of bytestr to str, for use in error messages."""
  return bytestr.decode('utf-8', 'backslashreplace')
133
def glob_to_regex(glob_bytestr):
  """
  Translate glob_bytestr into a regex operating on bytestrings.

  Raises SystemExit for globs that cannot round-trip through utf-8, since
  fnmatch.translate() only works on strings.
  """

  # fnmatch.translate is idiotic and won't accept bytestrings
  if (decode(glob_bytestr).encode() != glob_bytestr): # pragma: no cover
    # BUGFIX: this used to call .format() on a %-style format string ("...%s"),
    # so the offending glob was never substituted into the error message.
    raise SystemExit(_("Error: Cannot handle glob %s") % decode(glob_bytestr))

  # Create regex operating on string
  regex = fnmatch.translate(decode(glob_bytestr))

  # FIXME: This is an ugly hack...
  # fnmatch.translate tries to do multi-line matching and wants the glob to
  # match up to the end of the input, which isn't relevant for us, so we
  # have to modify the regex. fnmatch.translate has used different regex
  # constructs to achieve this with different python versions, so we have
  # to check for each of them and then fix it up. It would be much better
  # if fnmatch.translate could just take some flags to allow us to specify
  # what we want rather than employing this hackery, but since it
  # doesn't...
  if regex.endswith(r'\Z(?ms)'): # pragma: no cover
    regex = regex[0:-7]
  elif regex.startswith(r'(?s:') and regex.endswith(r')\Z'): # pragma: no cover
    regex = regex[4:-3]
  elif regex.startswith(r'(?s:') and regex.endswith(r')\z'): # pragma: no cover
    # Yaay, python3.14 for senselessly duplicating \Z as \z...
    regex = regex[4:-3]

  # Finally, convert back to regex operating on bytestr
  return regex.encode()
163
class PathQuoting:
  """
  Helpers for the C-style path quoting used in fast-export/fast-import
  streams: double-quoted paths whose special characters are backslash
  escapes and whose bytes >= 127 are three-digit octal escapes.
  """
  # Escape letter -> the raw byte it stands for
  _unescape = {b'a': b'\a',
               b'b': b'\b',
               b'f': b'\f',
               b'n': b'\n',
               b'r': b'\r',
               b't': b'\t',
               b'v': b'\v',
               b'"': b'"',
               b'\\':b'\\'}
  # Matches one escape sequence: backslash + (letter/quote/backslash | 3 octal digits)
  _unescape_re = re.compile(br'\\([a-z"\\]|[0-9]{3})')
  # Byte value -> its escaped representation; bytes 127-255 get octal escapes
  _escape = [bytes([x]) for x in range(127)]+[
             b'\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)]
  # Raw byte -> escape letter (inverse of _unescape), folded into _escape
  _reverse = dict(map(reversed, _unescape.items()))
  for x in _reverse:
    _escape[ord(x)] = b'\\'+_reverse[x]
  # _special_chars[b] is True iff byte b requires a multi-byte escape
  _special_chars = [len(x) > 1 for x in _escape]

  @staticmethod
  def unescape_sequence(orig):
    # re.sub callback for _unescape_re: expand one escape sequence
    seq = orig.group(1)
    return PathQuoting._unescape[seq] if len(seq) == 1 else bytes([int(seq, 8)])

  @staticmethod
  def dequote(quoted_string):
    # A path wrapped in double quotes has its escapes expanded; anything
    # else is already a literal path
    if quoted_string.startswith(b'"'):
      assert quoted_string.endswith(b'"')
      return PathQuoting._unescape_re.sub(PathQuoting.unescape_sequence,
                                          quoted_string[1:-1])
    return quoted_string

  @staticmethod
  def enquote(unquoted_string):
    # Option 1: Quoting when fast-export would:
    # pqsc = PathQuoting._special_chars
    # if any(pqsc[x] for x in set(unquoted_string)):
    # Option 2, perf hack: do minimal amount of quoting required by fast-import
    if unquoted_string.startswith(b'"') or b'\n' in unquoted_string:
      pqe = PathQuoting._escape
      return b'"' + b''.join(pqe[x] for x in unquoted_string) + b'"'
    return unquoted_string
205
class AncestryGraph(object):
  """
  Maintains a directed acyclic graph of commits, used to answer whether one
  commit is an ancestor of another.

  A note about identifiers:
    * Commit objects carry two ids -- commit.old_id (from the original
      fast-export stream, often an integer but sometimes a hash) and
      commit.id (for the new fast-import stream, which may differ when new
      blobs/commits are inserted). A given AncestryGraph is keyed on one or
      the other, never both; those keys are the keys of self.value.
    * Internally each external id is mapped to a small integer (self.value),
      and the graph structure is stored over those integers (self.graph),
      since repeating full hashes in every child list felt wasteful.
    * For graphs keyed on commit.old_id we can also record each commit's git
      hash (self.git_hash); for graphs keyed on commit.id we cannot know the
      hash until fast-import creates the commit and reports it.
  """

  def __init__(self):
    # Next internal integer id to hand out; incremented per recorded commit
    self.cur_value = 0

    # external id (commit.old_id or commit.id) -> internal integer id
    self.value = {}

    # internal id -> (depth, list-of-parent-internal-ids); a commit's depth
    # is one more than the max depth of its parents
    self.graph = {}

    # external id -> git commit hash; only populated for graphs keyed on
    # commit.old_id (see class docstring)
    self.git_hash = {}

    # Reverse maps; populated lazily via _ensure_reverse_maps_populated()
    self._reverse_value = {}
    self._hash_to_id = {}

    # Memoized results of previous is_ancestor() queries
    self._cached_is_ancestor = {}

  def record_external_commits(self, external_commits):
    """
    Record that each commit in external_commits exists, treating each one
    as a root commit with no parents.
    """
    for commit in external_commits:
      if commit in self.value:
        continue
      self.cur_value += 1
      self.value[commit] = self.cur_value
      self.graph[self.cur_value] = (1, [])
      self.git_hash[commit] = commit

  def add_commit_and_parents(self, commit, parents, githash = None):
    """
    Record that commit has the given parents (all identified by fast-export
    stream ids, usually integers but sometimes hashes). All parents _MUST_
    already be recorded; commit _MUST_ not be. Also records the mapping from
    commit to githash, when githash is given.
    """
    assert all(p in self.value for p in parents)
    assert commit not in self.value

    self.cur_value += 1
    self.value[commit] = self.cur_value
    if githash:
      self.git_hash[commit] = githash

    # Depth is one more than the deepest parent (1 for root commits)
    parent_values = [self.value[p] for p in parents]
    depth = 1 + max((self.graph[p][0] for p in parent_values), default=0)
    self.graph[self.cur_value] = (depth, parent_values)

  def record_hash(self, commit_id, githash):
    '''
    If a githash was not recorded for commit_id when add_commit_and_parents
    was called, add it now.
    '''
    assert commit_id in self.value
    assert commit_id not in self.git_hash
    self.git_hash[commit_id] = githash

  def _ensure_reverse_maps_populated(self):
    # Build both reverse maps together, only on first use
    if self._hash_to_id:
      return
    assert not self._reverse_value
    self._hash_to_id = {h: e for e, h in self.git_hash.items()}
    self._reverse_value = {i: e for e, i in self.value.items()}

  def get_parent_hashes(self, commit_hash):
    '''
    Given a commit hash, return the hashes of its parents.
    '''
    # Chain of lookups:
    #   commit hash -> fast-export id -> internal graph id
    #   -> parent graph ids -> parent fast-export ids -> parent hashes
    self._ensure_reverse_maps_populated()
    export_id = self._hash_to_id[commit_hash]
    graph_id = self.value[export_id]
    parent_export_ids = [self._reverse_value[g]
                         for g in self.graph[graph_id][1]]
    return [self.git_hash[e] for e in parent_export_ids]

  def map_to_hash(self, commit_id):
    '''
    Given a commit's fast-export stream id, return its hash (or None).
    '''
    return self.git_hash.get(commit_id, None)

  def is_ancestor(self, possible_ancestor, check):
    """
    Return whether possible_ancestor is an ancestor of check
    """
    target, start = self.value[possible_ancestor], self.value[check]
    query = (target, start)
    target_depth = self.graph[target][0]
    to_visit = [start]
    seen = set()
    while to_visit:
      node = to_visit.pop()
      pair = (target, node)
      if pair in self._cached_is_ancestor:
        if not self._cached_is_ancestor[pair]:
          continue
        self._cached_is_ancestor[query] = True
        return True
      if node in seen:
        continue
      seen.add(node)
      depth, parents = self.graph[node]
      if node == target:
        self._cached_is_ancestor[query] = True
        return True
      if depth <= target_depth:
        # Everything above node is strictly shallower than node, and target
        # is at least this deep, so target cannot be an ancestor here; prune.
        continue
      to_visit.extend(parents)
    self._cached_is_ancestor[query] = False
    return False
368
class MailmapInfo(object):
  """
  Parsed representation of a git mailmap file: maps (name, email) pairs as
  found in commits to the canonical (name, email) that should replace them.
  """
  def __init__(self, filename):
    # (commit_name or None, commit_email or None) -> (proper_name, proper_email)
    self.changes = {}
    self._parse_file(filename)

  def _parse_file(self, filename):
    """
    Parse filename as a mailmap file, populating self.changes.
    Raises SystemExit for unreadable files and unparseable lines.
    """
    name_and_email_re = re.compile(br'(.*?)\s*<([^>]*)>\s*')
    comment_re = re.compile(br'\s*#.*')
    if not os.access(filename, os.R_OK):
      raise SystemExit(_("Cannot read %s") % decode(filename))
    with open(filename, 'br') as f:
      count = 0
      for line in f:
        count += 1
        err = "Unparseable mailmap file: line #{} is bad: {}".format(count, line)
        # Remove comments
        line = comment_re.sub(b'', line)
        # Remove leading and trailing whitespace
        line = line.strip()
        if not line:
          continue

        # First name/email pair on the line is the canonical ("proper") one
        m = name_and_email_re.match(line)
        if not m:
          raise SystemExit(err)
        proper_name, proper_email = m.groups()
        if len(line) == m.end():
          # Only one pair given: match on email alone (key name is None)
          self.changes[(None, proper_email)] = (proper_name, proper_email)
          continue
        # Second pair (or bare name) is what appears in the commits
        rest = line[m.end():]
        m = name_and_email_re.match(rest)
        if m:
          commit_name, commit_email = m.groups()
          if len(rest) != m.end():
            raise SystemExit(err)
        else:
          commit_name, commit_email = rest, None
        self.changes[(commit_name, commit_email)] = (proper_name, proper_email)

  def translate(self, name, email):
    ''' Given a name and email, return the expected new name and email from the
        mailmap if there is a translation rule for it, otherwise just return
        the given name and email.'''
    # Rules are checked in file order; a rule matches when its email agrees
    # (or it has no email restriction) and its name agrees (or it has no
    # name restriction)
    for old, new in self.changes.items():
      old_name, old_email = old
      new_name, new_email = new
      if (old_email is None or email.lower() == old_email.lower()) and (
          name == old_name or not old_name):
        return (new_name or name, new_email or email)
    return (name, email)
419
class ProgressWriter(object):
  """
  Single-line progress display: each message overwrites the previous one on
  the same terminal line, throttled to roughly ten updates per second.
  """

  def __init__(self):
    self._last_progress_update = time.time()
    self._last_message = None

  def show(self, msg):
    """Remember msg and display it, unless we displayed too recently."""
    self._last_message = msg
    now = time.time()
    if now - self._last_progress_update <= .1:
      return
    self._last_progress_update = now
    sys.stdout.write("\r{}".format(msg))
    sys.stdout.flush()

  def finish(self):
    """Force the final message out regardless of throttling; end the line."""
    self._last_progress_update = 0
    if self._last_message:
      self.show(self._last_message)
    sys.stdout.write("\n")
438
439class _IDs(object):
440 """
441 A class that maintains the 'name domain' of all the 'marks' (short int
442 id for a blob/commit git object). There are two reasons this mechanism
443 is necessary:
444 (1) the output text of fast-export may refer to an object using a different
445 mark than the mark that was assigned to that object using IDS.new().
446 (This class allows you to translate the fast-export marks, "old" to
447 the marks assigned from IDS.new(), "new").
448 (2) when we prune a commit, its "old" id becomes invalid. Any commits
449 which had that commit as a parent needs to use the nearest unpruned
450 ancestor as its parent instead.
451
452 Note that for purpose (1) above, this typically comes about because the user
453 manually creates Blob or Commit objects (for insertion into the stream).
454 It could also come about if we attempt to read the data from two different
455 repositories and trying to combine the data (git fast-export will number ids
456 from 1...n, and having two 1's, two 2's, two 3's, causes issues; granted, we
457 this scheme doesn't handle the two streams perfectly either, but if the first
458 fast export stream is entirely processed and handled before the second stream
459 is started, this mechanism may be sufficient to handle it).
460 """
461
462 def __init__(self):
463 """
464 Init
465 """
466 # The id for the next created blob/commit object
467 self._next_id = 1
468
469 # A map of old-ids to new-ids (1:1 map)
470 self._translation = {}
471
472 # A map of new-ids to every old-id that points to the new-id (1:N map)
473 self._reverse_translation = {}
474
475 def has_renames(self):
476 """
477 Return whether there have been ids remapped to new values
478 """
479 return bool(self._translation)
480
481 def new(self):
482 """
483 Should be called whenever a new blob or commit object is created. The
484 returned value should be used as the id/mark for that object.
485 """
486 rv = self._next_id
487 self._next_id += 1
488 return rv
489
490 def record_rename(self, old_id, new_id, handle_transitivity = False):
491 """
492 Record that old_id is being renamed to new_id.
493 """
494 if old_id != new_id or old_id in self._translation:
495 # old_id -> new_id
496 self._translation[old_id] = new_id
497
498 # Transitivity will be needed if new commits are being inserted mid-way
499 # through a branch.
500 if handle_transitivity:
501 # Anything that points to old_id should point to new_id
502 if old_id in self._reverse_translation:
503 for id_ in self._reverse_translation[old_id]:
504 self._translation[id_] = new_id
505
506 # Record that new_id is pointed to by old_id
507 if new_id not in self._reverse_translation:
508 self._reverse_translation[new_id] = []
509 self._reverse_translation[new_id].append(old_id)
510
511 def translate(self, old_id):
512 """
513 If old_id has been mapped to an alternate id, return the alternate id.
514 """
515 if old_id in self._translation:
516 return self._translation[old_id]
517 else:
518 return old_id
519
520 def __str__(self):
521 """
522 Convert IDs to string; used for debugging
523 """
524 rv = "Current count: %d\nTranslation:\n" % self._next_id
525 for k in sorted(self._translation):
526 rv += " %d -> %s\n" % (k, self._translation[k])
527
528 rv += "Reverse translation:\n"
529 reverse_keys = list(self._reverse_translation.keys())
530 if None in reverse_keys: # pragma: no cover
531 reverse_keys.remove(None)
532 reverse_keys = sorted(reverse_keys)
533 reverse_keys.append(None)
534 for k in reverse_keys:
535 rv += " " + str(k) + " -> " + str(self._reverse_translation[k]) + "\n"
536
537 return rv
538
539class _GitElement(object):
540 """
541 The base class for all git elements that we create.
542 """
543
544 def __init__(self):
545 # A string that describes what type of Git element this is
546 self.type = None
547
548 # A flag telling us if this Git element has been dumped
549 # (i.e. printed) or skipped. Typically elements that have been
550 # dumped or skipped will not be dumped again.
551 self.dumped = 0
552
553 def dump(self, file_):
554 """
555 This version should never be called. Derived classes need to
556 override! We should note that subclasses should implement this
557 method such that the output would match the format produced by
558 fast-export.
559 """
560 raise SystemExit(_("Unimplemented function: %s") % type(self).__name__
561 +".dump()") # pragma: no cover
562
563 def __bytes__(self):
564 """
565 Convert GitElement to bytestring; used for debugging
566 """
567 old_dumped = self.dumped
568 writeme = io.BytesIO()
569 self.dump(writeme)
570 output_lines = writeme.getvalue().splitlines()
571 writeme.close()
572 self.dumped = old_dumped
573 return b"%s:\n %s" % (type(self).__name__.encode(),
574 b"\n ".join(output_lines))
575
576 def skip(self, new_id=None):
577 """
578 Ensures this element will not be written to output
579 """
580 self.dumped = 2
581
class _GitElementWithId(_GitElement):
  """
  The base class for Git elements that have IDs (commits and blobs)
  """

  def __init__(self):
    super().__init__()

    # The mark (short, portable id) for this element
    self.id = _IDS.new()

    # The previous mark for this element
    self.old_id = None

  def skip(self, new_id=None):
    """
    Stop this element from being automatically written to output, and
    redirect references to its id to new_id (important for skipped commits,
    whose id must be translated to that of a surviving parent).
    """
    self.dumped = 2

    _IDS.record_rename(self.old_id or self.id, new_id)
605
class Blob(_GitElementWithId):
  """
  This class defines our representation of git blob elements, i.e. our way
  of representing file contents.
  """

  def __init__(self, data, original_id = None):
    super().__init__()

    # Denote that this is a blob
    self.type = 'blob'

    # Hash this blob had in the original repository, if known
    self.original_id = original_id

    # The raw file contents
    assert(type(data) == bytes)
    self.data = data

  def dump(self, file_):
    """
    Write this blob element to a file, in fast-import format.
    """
    self.dumped = 1
    # Keep the hash <-> new-mark correspondence for later lookups
    BLOB_HASH_TO_NEW_ID[self.original_id] = self.id
    BLOB_NEW_ID_TO_HASH[self.id] = self.original_id

    file_.write(b'blob\nmark :%d\ndata %d\n%s\n' %
                (self.id, len(self.data), self.data))
637
638
class Reset(_GitElement):
  """
  This class defines our representation of git reset elements: the creation
  (or recreation) of a named branch, optionally starting from a specific
  revision.
  """

  def __init__(self, ref, from_ref = None):
    _GitElement.__init__(self)

    self.type = 'reset'       # element kind tag
    self.ref = ref            # name of the branch being (re)created
    self.from_ref = from_ref  # branch/commit we are resetting from, if any

  def dump(self, file_):
    """
    Write this reset element to a file, in fast-import format.
    """
    self.dumped = 1

    file_.write(b'reset %s\n' % self.ref)
    if self.from_ref:
      # Integer from_refs are marks; anything else is written literally
      fmt = b'from :%d\n' if isinstance(self.from_ref, int) else b'from %s\n'
      file_.write(fmt % self.from_ref)
    file_.write(b'\n')
671
class FileChange(_GitElement):
  """
  This class defines our representation of file change elements. File change
  elements are components within a Commit element.
  """

  def __init__(self, type_, filename = None, id_ = None, mode = None):
    _GitElement.__init__(self)

    # Kind of file-change: b'M' (modify), b'D' (delete), b'R' (rename), or
    # b'DELETEALL'. (We could assert(type(type_) == bytes) here, but don't,
    # purely due to worries about performance overhead...)
    self.type = type_

    # Name of the file being changed
    self.filename = filename

    # Mode of the file entry (non-executable, executable, or symlink)
    self.mode = mode

    # The mark of the blob this change refers to
    self.blob_id = id_

    if type_ == b'DELETEALL':
      assert filename is None and id_ is None and mode is None
      self.filename = b'' # Just so PathQuoting.enquote doesn't die
      return

    assert filename is not None
    if type_ == b'M':
      assert id_ is not None and mode is not None
    elif type_ == b'D':
      assert id_ is None and mode is None
    elif type_ == b'R': # pragma: no cover (now avoid fast-export renames)
      assert mode is None
      if id_ is None:
        raise SystemExit(_("new name needed for rename of %s") % filename)
      # Renames store (old_name, new_name) in filename and have no blob
      self.filename = (self.filename, id_)
      self.blob_id = None

  def dump(self, file_):
    """
    Write this file-change element to a file, in fast-import format.
    """
    if self.type == b'M' and self.blob_id is None:
      # The referenced blob was skipped, so drop this change as well
      return
    self.dumped = 1

    path = PathQuoting.enquote(self.filename)
    if self.type == b'M':
      if isinstance(self.blob_id, int):
        file_.write(b'M %s :%d %s\n' % (self.mode, self.blob_id, path))
      else:
        file_.write(b'M %s %s %s\n' % (self.mode, self.blob_id, path))
    elif self.type == b'D':
      file_.write(b'D %s\n' % path)
    elif self.type == b'DELETEALL':
      file_.write(b'deleteall\n')
    else:
      raise SystemExit(_("Unhandled filechange type: %s") % self.type) # pragma: no cover
733
class Commit(_GitElementWithId):
  """
  This class defines our representation of commit elements. Commit elements
  contain all the information associated with a commit.
  """

  def __init__(self, branch,
               author_name, author_email, author_date,
               committer_name, committer_email, committer_date,
               message,
               file_changes,
               parents,
               original_id = None,
               encoding = None, # encoding for message; None implies UTF-8
               **kwargs): # extra keyword args accepted; not used in this body
    _GitElementWithId.__init__(self)
    # Commits remember the mark they started with, even if renumbered later
    self.old_id = self.id

    # Denote that this is a commit element
    self.type = 'commit'

    # Record the affected branch
    self.branch = branch

    # Record original id
    self.original_id = original_id

    # Record author's name
    self.author_name  = author_name

    # Record author's email
    self.author_email = author_email

    # Record date of authoring
    self.author_date  = author_date

    # Record committer's name
    self.committer_name  = committer_name

    # Record committer's email
    self.committer_email = committer_email

    # Record date the commit was made
    self.committer_date  = committer_date

    # Record commit message and its encoding
    self.encoding = encoding
    self.message = message

    # List of file-changes associated with this commit. Note that file-changes
    # are also represented as git elements
    self.file_changes = file_changes

    self.parents = parents

  def dump(self, file_):
    """
    Write this commit element to a file.
    """
    self.dumped = 1

    # Make output to fast-import slightly easier for humans to read if the
    # message has no trailing newline of its own; cosmetic, but a nice touch...
    extra_newline = b'\n'
    if self.message.endswith(b'\n') or not (self.parents or self.file_changes):
      extra_newline = b''

    if not self.parents:
      # Root commits get a reset first so they don't build on the branch's
      # previous tip
      file_.write(b'reset %s\n' % self.branch)
    file_.write((b'commit %s\n'
                 b'mark :%d\n'
                 b'author %s <%s> %s\n'
                 b'committer %s <%s> %s\n'
                ) % (
                  self.branch, self.id,
                  self.author_name, self.author_email, self.author_date,
                  self.committer_name, self.committer_email, self.committer_date
                ))
    if self.encoding:
      file_.write(b'encoding %s\n' % self.encoding)
    file_.write(b'data %d\n%s%s' %
                (len(self.message), self.message, extra_newline))
    # First parent uses 'from'; any additional (merge) parents use 'merge'
    for i, parent in enumerate(self.parents):
      file_.write(b'from ' if i==0 else b'merge ')
      if isinstance(parent, int):
        file_.write(b':%d\n' % parent)
      else:
        file_.write(b'%s\n' % parent)
    for change in self.file_changes:
      change.dump(file_)
    if not self.parents and not self.file_changes:
      # Workaround a bug in pre-git-2.22 versions of fast-import with
      # the get-mark directive.
      file_.write(b'\n')
    file_.write(b'\n')

  def first_parent(self):
    """
    Return first parent commit
    """
    if self.parents:
      return self.parents[0]
    return None

  def skip(self, new_id=None):
    # Remember that this commit was pruned, then remap its id as usual
    _SKIPPED_COMMITS.add(self.old_id or self.id)
    _GitElementWithId.skip(self, new_id)
841
class Tag(_GitElementWithId):
  """
  This class defines our representation of annotated tag elements.
  """

  def __init__(self, ref, from_ref,
               tagger_name, tagger_email, tagger_date, tag_msg,
               original_id = None):
    super().__init__()
    # Tags remember the mark they started with, even if renumbered later
    self.old_id = self.id

    self.type = 'tag'                # element kind tag
    self.ref = ref                   # name of the tag
    self.from_ref = from_ref         # entity being tagged (should be a commit)
    self.original_id = original_id   # hash of this tag in the original repo
    self.tagger_name = tagger_name   # name of the tagger
    self.tagger_email = tagger_email # email of the tagger
    self.tagger_date = tagger_date   # date of tagging
    self.message = tag_msg           # the tag message

  def dump(self, file_):
    """
    Write this tag element to a file, in fast-import format.
    """

    self.dumped = 1

    file_.write(b'tag %s\n' % self.ref)
    if write_marks and self.id:
      file_.write(b'mark :%d\n' % self.id)
    if isinstance(self.from_ref, int):
      file_.write(b'from :%d\n' % self.from_ref)
    else:
      file_.write(b'from %s\n' % self.from_ref)
    if self.tagger_name:
      file_.write(b'tagger %s <%s> ' % (self.tagger_name, self.tagger_email))
      file_.write(self.tagger_date)
      file_.write(b'\n')
    file_.write(b'data %d\n%s' % (len(self.message), self.message))
    file_.write(b'\n')
895
class Progress(_GitElement):
  """
  This class defines our representation of progress elements: a message
  that fast-import prints when it processes the progress command.
  """

  def __init__(self, message):
    _GitElement.__init__(self)

    self.type = 'progress'  # element kind tag
    self.message = message  # the progress message to emit

  def dump(self, file_):
    """
    Write this progress element to a file, in fast-import format.
    """
    self.dumped = 1

    file_.write(b'progress %s\n\n' % self.message)
920
class Checkpoint(_GitElement):
  """
  This class defines our representation of checkpoint elements: events
  which force fast-import to close the current packfile, start a new one,
  and save out all current branch refs, tags and marks.
  """

  def __init__(self):
    _GitElement.__init__(self)

    self.type = 'checkpoint'  # element kind tag

  def dump(self, file_):
    """
    Write this checkpoint element to a file, in fast-import format.
    """
    self.dumped = 1

    file_.write(b'checkpoint\n\n')
943
class LiteralCommand(_GitElement):
  """
  This class defines our representation of raw commands: a single line that
  is passed through without any special processing.
  """

  def __init__(self, line):
    _GitElement.__init__(self)

    self.type = 'literal'  # element kind tag
    self.line = line       # the raw line, written out verbatim

  def dump(self, file_):
    """
    Write the stored line to a file, unchanged.
    """
    self.dumped = 1

    file_.write(self.line)
966
class Alias(_GitElement):
  """
  Represents a fast-import 'alias' element: the setting of one mark to
  the same sha1sum as another, usually because the newer mark
  corresponded to a pruned commit.
  """

  def __init__(self, ref, to_ref):
    _GitElement.__init__(self)
    # Tag this object so dispatching code can identify it
    self.type = 'alias'

    # Marks involved: ref is aliased to to_ref
    self.ref = ref
    self.to_ref = to_ref

  def dump(self, file_):
    """
    Serialize this alias element to file_ in fast-import format
    """
    self.dumped = 1
    file_.write(b'alias\nmark :%d\nto :%d\n\n' % (self.ref, self.to_ref))
989
class FastExportParser(object):
  """
  A class for parsing and handling the output from fast-export. This
  class allows the user to register callbacks when various types of
  data are encountered in the fast-export output. The basic idea is that,
  FastExportParser takes fast-export output, creates the various objects
  as it encounters them, the user gets to use/modify these objects via
  callbacks, and finally FastExportParser outputs the modified objects
  in fast-import format (presumably so they can be used to create a new
  repo).
  """

  def __init__(self,
               tag_callback = None, commit_callback = None,
               blob_callback = None, progress_callback = None,
               reset_callback = None, checkpoint_callback = None,
               done_callback = None):
    # Members below simply store callback functions for the various git
    # elements
    self._tag_callback = tag_callback
    self._blob_callback = blob_callback
    self._reset_callback = reset_callback
    self._commit_callback = commit_callback
    self._progress_callback = progress_callback
    self._checkpoint_callback = checkpoint_callback
    self._done_callback = done_callback

    # Keep track of which refs appear from the export, and which make it to
    # the import (pruning of empty commits, renaming of refs, and creating
    # new manual objects and inserting them can cause these to differ).
    self._exported_refs = set()
    self._imported_refs = set()

    # A list of the branches we've seen, plus the last known commit they
    # pointed to. An entry in latest_*commit will be deleted if we get a
    # reset for that branch. These are used because of fast-import's weird
    # decision to allow having an implicit parent via naming the branch
    # instead of requiring branches to be specified via 'from' directives.
    self._latest_commit = {}
    self._latest_orig_commit = {}

    # A handle to the input source for the fast-export data
    self._input = None

    # A handle to the output file for the output we generate (we call dump
    # on many of the git elements we create).
    self._output = None

    # Stores the contents of the current line of input being parsed
    self._currentline = ''

    # Tracks LFS objects we have found
    self._lfs_object_tracker = None

    # Compile some regexes and cache those
    self._mark_re = re.compile(br'mark :(\d+)\n$')
    self._parent_regexes = {}
    # Parents appear either as a mark (':<num>') or as a full 40-char sha1
    parent_regex_rules = (br' :(\d+)\n$', br' ([0-9a-f]{40})\n')
    for parent_refname in (b'from', b'merge'):
      ans = [re.compile(parent_refname+x) for x in parent_regex_rules]
      self._parent_regexes[parent_refname] = ans
    self._quoted_string_re = re.compile(br'"(?:[^"\\]|\\.)*"')
    self._refline_regexes = {}
    for refline_name in (b'reset', b'commit', b'tag', b'progress'):
      self._refline_regexes[refline_name] = re.compile(refline_name+b' (.*)\n$')
    self._user_regexes = {}
    for user in (b'author', b'committer', b'tagger'):
      self._user_regexes[user] = re.compile(user + b' (.*?) <(.*?)> (.*)\n$')

  def _advance_currentline(self):
    """
    Grab the next line of input
    """
    self._currentline = self._input.readline()

  def _parse_optional_mark(self):
    """
    If the current line contains a mark, parse it and advance to the
    next line; return None otherwise
    """
    mark = None
    matches = self._mark_re.match(self._currentline)
    if matches:
      mark = int(matches.group(1))
      self._advance_currentline()
    return mark

  def _parse_optional_parent_ref(self, refname):
    """
    If the current line contains a reference to a parent commit, then
    parse it and advance the current line; otherwise return None. Note
    that the name of the reference ('from', 'merge') must match the
    refname arg.
    """
    orig_baseref, baseref = None, None
    rule, altrule = self._parent_regexes[refname]
    matches = rule.match(self._currentline)
    if matches:
      orig_baseref = int(matches.group(1))
      # We translate the parent commit mark to what it needs to be in
      # our mark namespace
      baseref = _IDS.translate(orig_baseref)
      self._advance_currentline()
    else:
      # Not a mark; try the raw 40-char sha1 form, which needs no
      # translation
      matches = altrule.match(self._currentline)
      if matches:
        orig_baseref = matches.group(1)
        baseref = orig_baseref
        self._advance_currentline()
    return orig_baseref, baseref

  def _parse_optional_filechange(self):
    """
    If the current line contains a file-change object, then parse it
    and advance the current line; otherwise return None. We only care
    about file changes of type b'M' and b'D' (these are the only types
    of file-changes that fast-export will provide).
    """
    filechange = None
    changetype = self._currentline[0:1]
    if changetype == b'M':
      (changetype, mode, idnum, path) = self._currentline.split(None, 3)
      if idnum[0:1] == b':':
        idnum = idnum[1:]
      path = path.rstrip(b'\n')
      # Check for LFS objects from sources before we might toss this filechange
      # (mode 160000 is a submodule commit reference, not a blob)
      if mode != b'160000' and self._lfs_object_tracker:
        value = int(idnum) if len(idnum) != 40 else idnum
        self._lfs_object_tracker.check_file_change_data(value, True)
      # We translate the idnum to our id system
      if len(idnum) != 40:
        idnum = _IDS.translate( int(idnum) )
      if idnum is not None:
        if path.startswith(b'"'):
          path = PathQuoting.dequote(path)
        filechange = FileChange(b'M', path, idnum, mode)
      else:
        # The mark translated to nothing (e.g. referenced object was
        # pruned); signal the caller to drop this filechange
        filechange = b'skipped'
      self._advance_currentline()
    elif changetype == b'D':
      (changetype, path) = self._currentline.split(None, 1)
      path = path.rstrip(b'\n')
      if path.startswith(b'"'):
        path = PathQuoting.dequote(path)
      filechange = FileChange(b'D', path)
      self._advance_currentline()
    elif changetype == b'R': # pragma: no cover (now avoid fast-export renames)
      rest = self._currentline[2:-1]
      if rest.startswith(b'"'):
        m = self._quoted_string_re.match(rest)
        if not m:
          raise SystemExit(_("Couldn't parse rename source"))
        orig = PathQuoting.dequote(m.group(0))
        new = rest[m.end()+1:]
      else:
        orig, new = rest.split(b' ', 1)
      if new.startswith(b'"'):
        new = PathQuoting.dequote(new)
      filechange = FileChange(b'R', orig, new)
      self._advance_currentline()
    return filechange

  def _parse_original_id(self):
    """
    Parse the value from an 'original-oid' line and advance current-line.
    """
    original_id = self._currentline[len(b'original-oid '):].rstrip()
    self._advance_currentline()
    return original_id

  def _parse_encoding(self):
    """
    Parse the value from an 'encoding' line and advance current-line.
    """
    encoding = self._currentline[len(b'encoding '):].rstrip()
    self._advance_currentline()
    return encoding

  def _parse_ref_line(self, refname):
    """
    Parses string data (often a branch name) from current-line. The name of
    the string data must match the refname arg. The program will crash if
    current-line does not match, so current-line will always be advanced if
    this method returns.
    """
    matches = self._refline_regexes[refname].match(self._currentline)
    if not matches:
      raise SystemExit(_("Malformed %(refname)s line: '%(line)s'") %
                       ({'refname': refname, 'line':self._currentline})
                       ) # pragma: no cover
    ref = matches.group(1)
    self._advance_currentline()
    return ref

  def _parse_user(self, usertype):
    """
    Get user name, email, datestamp from current-line. Current-line will
    be advanced.
    """
    user_regex = self._user_regexes[usertype]
    (name, email, when) = user_regex.match(self._currentline).groups()

    self._advance_currentline()
    return (name, email, when)

  def _parse_data(self):
    """
    Reads data from _input. Current-line will be advanced until it is beyond
    the data.
    """
    fields = self._currentline.split()
    assert fields[0] == b'data'
    size = int(fields[1])
    # Read exactly the advertised number of bytes; the payload is not
    # line-oriented so we bypass readline here
    data = self._input.read(size)
    self._advance_currentline()
    if self._currentline == b'\n':
      self._advance_currentline()
    return data

  def _parse_blob(self):
    """
    Parse input data into a Blob object. Once the Blob has been created, it
    will be handed off to the appropriate callbacks. Current-line will be
    advanced until it is beyond this blob's data. The Blob will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback).
    """
    # Parse the Blob
    self._advance_currentline()
    id_ = self._parse_optional_mark()

    original_id = None
    if self._currentline.startswith(b'original-oid'):
      original_id = self._parse_original_id();

    data = self._parse_data()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the blob
    blob = Blob(data, original_id)

    # If fast-export text had a mark for this blob, need to make sure this
    # mark translates to the blob's true id.
    if id_:
      blob.old_id = id_
      _IDS.record_rename(id_, blob.id)

    # Check for LFS objects
    if self._lfs_object_tracker:
      self._lfs_object_tracker.check_blob_data(data, blob.old_id, True)

    # Call any user callback to allow them to use/modify the blob
    if self._blob_callback:
      self._blob_callback(blob)

    # Now print the resulting blob
    if not blob.dumped:
      blob.dump(self._output)

  def _parse_reset(self):
    """
    Parse input data into a Reset object. Once the Reset has been created,
    it will be handed off to the appropriate callbacks. Current-line will
    be advanced until it is beyond the reset data. The Reset will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback).
    """
    # Parse the Reset
    ref = self._parse_ref_line(b'reset')
    self._exported_refs.add(ref)
    ignoreme, from_ref = self._parse_optional_parent_ref(b'from')
    if self._currentline == b'\n':
      self._advance_currentline()

    # fast-export likes to print extraneous resets that serve no purpose.
    # While we could continue processing such resets, that is a waste of
    # resources. Also, we want to avoid recording that this ref was
    # seen in such cases, since this ref could be rewritten to nothing.
    if not from_ref:
      self._latest_commit.pop(ref, None)
      self._latest_orig_commit.pop(ref, None)
      return

    # Create the reset
    reset = Reset(ref, from_ref)

    # Call any user callback to allow them to modify the reset
    if self._reset_callback:
      self._reset_callback(reset)

    # Update metadata
    self._latest_commit[reset.ref] = reset.from_ref
    self._latest_orig_commit[reset.ref] = reset.from_ref

    # Now print the resulting reset
    if not reset.dumped:
      self._imported_refs.add(reset.ref)
      reset.dump(self._output)

  def _parse_commit(self):
    """
    Parse input data into a Commit object. Once the Commit has been created,
    it will be handed off to the appropriate callbacks. Current-line will
    be advanced until it is beyond the commit data. The Commit will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback OR the callback has removed all file-changes from the commit).
    """
    # Parse the Commit. This may look involved, but it's pretty simple; it only
    # looks bad because a commit object contains many pieces of data.
    branch = self._parse_ref_line(b'commit')
    self._exported_refs.add(branch)
    id_ = self._parse_optional_mark()

    original_id = None
    if self._currentline.startswith(b'original-oid'):
      original_id = self._parse_original_id();

    author_name = None
    author_email = None
    if self._currentline.startswith(b'author'):
      (author_name, author_email, author_date) = self._parse_user(b'author')

    (committer_name, committer_email, committer_date) = \
      self._parse_user(b'committer')

    # The author line is optional in fast-export streams; fall back to the
    # committer identity (and date) when it was absent or empty
    if not author_name and not author_email:
      (author_name, author_email, author_date) = \
        (committer_name, committer_email, committer_date)

    encoding = None
    if self._currentline.startswith(b'encoding '):
      encoding = self._parse_encoding()

    commit_msg = self._parse_data()

    pinfo = [self._parse_optional_parent_ref(b'from')]
    # Due to empty pruning, we can have real 'from' and 'merge' lines that
    # due to commit rewriting map to a parent of None. We need to record
    # 'from' if its non-None, and we need to parse all 'merge' lines.
    while self._currentline.startswith(b'merge '):
      pinfo.append(self._parse_optional_parent_ref(b'merge'))
    orig_parents, parents = [list(tmp) for tmp in zip(*pinfo)]

    # No parents is oddly represented as [None] instead of [], due to the
    # special 'from' handling. Convert it here to a more canonical form.
    if parents == [None]:
      parents = []
    if orig_parents == [None]:
      orig_parents = []

    # fast-import format is kinda stupid in that it allows implicit parents
    # based on the branch name instead of requiring them to be specified by
    # 'from' directives. The only way to get no parent is by using a reset
    # directive first, which clears the latest_commit_for_this_branch tracking.
    if not orig_parents and self._latest_commit.get(branch):
      parents = [self._latest_commit[branch]]
    if not orig_parents and self._latest_orig_commit.get(branch):
      orig_parents = [self._latest_orig_commit[branch]]

    # Get the list of file changes
    file_changes = []
    file_change = self._parse_optional_filechange()
    had_file_changes = file_change is not None
    while file_change:
      # b'skipped' is a sentinel for a filechange whose mark translated to
      # nothing (pruned object); drop it rather than recording it
      if not (type(file_change) == bytes and file_change == b'skipped'):
        file_changes.append(file_change)
      file_change = self._parse_optional_filechange()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Okay, now we can finally create the Commit object
    commit = Commit(branch,
                    author_name, author_email, author_date,
                    committer_name, committer_email, committer_date,
                    commit_msg, file_changes, parents, original_id, encoding)

    # If fast-export text had a mark for this commit, need to make sure this
    # mark translates to the commit's true id.
    if id_:
      commit.old_id = id_
      _IDS.record_rename(id_, commit.id)

    # refs/notes/ put commit-message-related material in blobs, and name their
    # files according to the hash of other commits. That totally messes with
    # all normal callbacks; fast-export should really export these as different
    # kinds of objects. Until then, let's just pass these commits through as-is
    # and hope the blob callbacks don't mess things up.
    if commit.branch.startswith(b'refs/notes/'):
      self._imported_refs.add(commit.branch)
      commit.dump(self._output)
      return

    # Call any user callback to allow them to modify the commit
    aux_info = {'orig_parents': orig_parents,
                'had_file_changes': had_file_changes}
    if self._commit_callback:
      self._commit_callback(commit, aux_info)

    # Now print the resulting commit, or if prunable skip it
    self._latest_orig_commit[branch] = commit.id
    # NOTE: 'not X in Y' parses as 'not (X in Y)'; this records the commit
    # as the branch tip only when it was not skipped
    if not (commit.old_id or commit.id) in _SKIPPED_COMMITS:
      self._latest_commit[branch] = commit.id
    if not commit.dumped:
      self._imported_refs.add(commit.branch)
      commit.dump(self._output)

  def _parse_tag(self):
    """
    Parse input data into a Tag object. Once the Tag has been created,
    it will be handed off to the appropriate callbacks. Current-line will
    be advanced until it is beyond the tag data. The Tag will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback).
    """
    # Parse the Tag
    tag = self._parse_ref_line(b'tag')
    self._exported_refs.add(b'refs/tags/'+tag)
    id_ = self._parse_optional_mark()
    ignoreme, from_ref = self._parse_optional_parent_ref(b'from')

    original_id = None
    if self._currentline.startswith(b'original-oid'):
      original_id = self._parse_original_id();

    tagger_name, tagger_email, tagger_date = None, None, None
    if self._currentline.startswith(b'tagger'):
      (tagger_name, tagger_email, tagger_date) = self._parse_user(b'tagger')
    tag_msg = self._parse_data()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the tag
    tag = Tag(tag, from_ref,
              tagger_name, tagger_email, tagger_date, tag_msg,
              original_id)

    # If fast-export text had a mark for this tag, need to make sure this
    # mark translates to the tag's true id.
    if id_:
      tag.old_id = id_
      _IDS.record_rename(id_, tag.id)

    # Call any user callback to allow them to modify the tag
    if self._tag_callback:
      self._tag_callback(tag)

    # The tag might not point at anything that still exists (self.from_ref
    # will be None if the commit it pointed to and all its ancestors were
    # pruned due to being empty)
    if tag.from_ref:
      # Print out this tag's information
      if not tag.dumped:
        self._imported_refs.add(b'refs/tags/'+tag.ref)
        tag.dump(self._output)
    else:
      tag.skip()

  def _parse_progress(self):
    """
    Parse input data into a Progress object. Once the Progress has
    been created, it will be handed off to the appropriate
    callbacks. Current-line will be advanced until it is beyond the
    progress data. The Progress will be dumped to _output once
    everything else is done (unless it has been skipped by the callback).
    """
    # Parse the Progress
    message = self._parse_ref_line(b'progress')
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the progress message
    progress = Progress(message)

    # Call any user callback to allow them to modify the progress messsage
    if self._progress_callback:
      self._progress_callback(progress)

    # NOTE: By default, we do NOT print the progress message; git
    # fast-import would write it to fast_import_pipes which could mess with
    # our parsing of output from the 'ls' and 'get-mark' directives we send
    # to fast-import. If users want these messages, they need to process
    # and handle them in the appropriate callback above.

  def _parse_checkpoint(self):
    """
    Parse input data into a Checkpoint object. Once the Checkpoint has
    been created, it will be handed off to the appropriate
    callbacks. Current-line will be advanced until it is beyond the
    checkpoint data. The Checkpoint will be dumped to _output once
    everything else is done (unless it has been skipped by the callback).
    """
    # Parse the Checkpoint
    self._advance_currentline()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the checkpoint
    checkpoint = Checkpoint()

    # Call any user callback to allow them to drop the checkpoint
    if self._checkpoint_callback:
      self._checkpoint_callback(checkpoint)

    # NOTE: By default, we do NOT print the checkpoint message; although it
    # we would only realistically get them with --stdin, the fact that we
    # are filtering makes me think the checkpointing is less likely to be
    # reasonable. In fact, I don't think it's necessary in general. If
    # users do want it, they should process it in the checkpoint_callback.

  def _parse_literal_command(self):
    """
    Parse literal command. Then just dump the line as is.
    """
    # Create the literal command object
    command = LiteralCommand(self._currentline)
    self._advance_currentline()

    # Now print the resulting literal command
    if not command.dumped:
      command.dump(self._output)

  def insert(self, obj):
    """
    Dump a manually-created element into the output stream, recording its
    ref (for Commit/Reset/Tag objects) among the imported refs.
    """
    assert not obj.dumped
    obj.dump(self._output)
    if type(obj) == Commit:
      self._imported_refs.add(obj.branch)
    elif type(obj) in (Reset, Tag):
      self._imported_refs.add(obj.ref)

  def run(self, input, output):
    """
    This method filters fast export output.
    """
    # Set input. If no args provided, use stdin.
    self._input = input
    self._output = output

    # Run over the input and do the filtering
    self._advance_currentline()
    while self._currentline:
      if self._currentline.startswith(b'blob'):
        self._parse_blob()
      elif self._currentline.startswith(b'reset'):
        self._parse_reset()
      elif self._currentline.startswith(b'commit'):
        self._parse_commit()
      elif self._currentline.startswith(b'tag'):
        self._parse_tag()
      elif self._currentline.startswith(b'progress'):
        self._parse_progress()
      elif self._currentline.startswith(b'checkpoint'):
        self._parse_checkpoint()
      elif self._currentline.startswith(b'feature'):
        self._parse_literal_command()
      elif self._currentline.startswith(b'option'):
        self._parse_literal_command()
      elif self._currentline.startswith(b'done'):
        if self._done_callback:
          self._done_callback()
        self._parse_literal_command()
        # Prevent confusion from others writing additional stuff that'll just
        # be ignored
        self._output.close()
      elif self._currentline.startswith(b'#'):
        self._parse_literal_command()
      elif self._currentline.startswith(b'get-mark') or \
           self._currentline.startswith(b'cat-blob') or \
           self._currentline.startswith(b'ls'):
        raise SystemExit(_("Unsupported command: '%s'") % self._currentline)
      else:
        raise SystemExit(_("Could not parse line: '%s'") % self._currentline)

  def get_exported_and_imported_refs(self):
    """
    Return the (exported_refs, imported_refs) sets gathered during run().
    """
    return self._exported_refs, self._imported_refs
1559
def record_id_rename(old_id, new_id):
  """
  Register a new translation from old_id to new_id, propagating it
  transitively through any already-recorded renames.
  """
  # Third positional argument enables transitive handling
  _IDS.record_rename(old_id, new_id, True)
1566
# Internal globals
# Global mark-translation table; maps fast-export marks into our own
# mark namespace (see FastExportParser's use of _IDS.translate)
_IDS = _IDs()
# Commits pruned during filtering; consulted when deciding whether a
# commit may be recorded as a branch's latest commit
_SKIPPED_COMMITS = set()
# Map from original blob sha1 to the new id assigned during filtering
BLOB_HASH_TO_NEW_ID = {}
# Reverse mapping; presumably maintained alongside BLOB_HASH_TO_NEW_ID
# by code elsewhere in this file -- not populated in this section
BLOB_NEW_ID_TO_HASH = {}
# User-facing instructions printed after a sensitive-data removal run.
# The [1:] strips the leading newline kept for source readability; the
# three %s placeholders are filled in by the caller.
sdr_next_steps = _("""
NEXT STEPS FOR YOUR SENSITIVE DATA REMOVAL:
  * If you are doing your rewrite in multiple steps, ignore these next steps
    until you have completed all your invocations of git-filter-repo.
  * See the "Sensitive Data Removal" subsection of the "DISCUSSION" section
    of the manual for more details about any of the steps below.
  * Inspect this repository and verify that the sensitive data is indeed
    completely removed from all commits.
  * Force push the rewritten history to the server:
      %s
  * Contact the server admins for additional steps they need to take; the
    First Changed Commit(s)%s may come in handy here.
  * Have other colleagues with a clone either discard their clone and reclone
    OR follow the detailed steps in the manual to repeatedly rebase and
    purge the sensitive data from their copy.  Again, the First Changed
    Commit(s)%s may come in handy.
  * See the "Prevent repeats and avoid future sensitive data spills" section
    of the manual.
"""[1:])
1591
class SubprocessWrapper(object):
  """
  Drop-in stand-in for the subprocess module that decodes bytes command
  arguments (and bytes 'cwd' values) to str before delegating, for
  platforms whose subprocess cannot accept bytes args.
  """

  @staticmethod
  def decodify(args):
    # A plain string needs no conversion
    if type(args) == str:
      return args
    assert type(args) == list
    # Decode only the bytes elements; leave everything else untouched
    return [decode(arg) if type(arg) == bytes else arg for arg in args]

  @staticmethod
  def call(*args, **kwargs):
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.call(SubprocessWrapper.decodify(*args), **kwargs)

  @staticmethod
  def check_output(*args, **kwargs):
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.check_output(SubprocessWrapper.decodify(*args), **kwargs)

  @staticmethod
  def check_call(*args, **kwargs): # pragma: no cover # used by filter-lamely
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.check_call(SubprocessWrapper.decodify(*args), **kwargs)

  @staticmethod
  def Popen(*args, **kwargs):
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.Popen(SubprocessWrapper.decodify(*args), **kwargs)
1624
# By default dispatch straight to the subprocess module, but on Windows
# (or when PRETEND_UNICODE_ARGS is set -- presumably so tests can exercise
# the wrapper on other platforms; confirm against the test suite) route
# through SubprocessWrapper so bytes args get decoded to str first.
subproc = subprocess
if platform.system() == 'Windows' or 'PRETEND_UNICODE_ARGS' in os.environ:
  subproc = SubprocessWrapper
1628
class GitUtils(object):
  """
  Collection of static helpers that shell out to git to inspect a
  repository (counts, refs, config, blob sizes, diffs).
  """

  @staticmethod
  def get_commit_count(repo, *args):
    """
    Return the number of commits that have been made on repo.
    """
    if not args:
      args = ['--all']
    # Allow callers to pass a single list of rev-list args instead of varargs
    if len(args) == 1 and isinstance(args[0], list):
      args = args[0]
    p = subproc.Popen(["git", "rev-list", "--count"] + args,
                      stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                      cwd=repo)
    if p.wait() != 0:
      raise SystemExit(_("%s does not appear to be a valid git repository")
                       % decode(repo))
    return int(p.stdout.read())

  @staticmethod
  def get_total_objects(repo):
    """
    Return the number of objects (both packed and unpacked)
    """
    p1 = subproc.Popen(["git", "count-objects", "-v"],
                       stdout=subprocess.PIPE, cwd=repo)
    lines = p1.stdout.read().splitlines()
    # Return unpacked objects + packed-objects
    return int(lines[0].split()[1]) + int(lines[2].split()[1])

  @staticmethod
  def is_repository_bare(repo_working_dir):
    """
    Return True if the repository at repo_working_dir is bare.
    """
    out = subproc.check_output('git rev-parse --is-bare-repository'.split(),
                               cwd=repo_working_dir)
    return (out.strip() == b'true')

  @staticmethod
  def determine_git_dir(repo_working_dir):
    """
    Return the path to the .git directory for repo_working_dir, joined
    onto repo_working_dir when git reports a relative path.
    """
    d = subproc.check_output('git rev-parse --git-dir'.split(),
                             cwd=repo_working_dir).strip()
    if repo_working_dir==b'.' or d.startswith(b'/'):
      return d
    return os.path.join(repo_working_dir, d)

  @staticmethod
  def get_refs(repo_working_dir):
    """
    Return a dict mapping refname -> sha1 for all refs in the repository.
    """
    try:
      output = subproc.check_output('git show-ref'.split(),
                                    cwd=repo_working_dir)
    except subprocess.CalledProcessError as e:
      # If error code is 1, there just aren't any refs; i.e. new repo.
      # If error code is other than 1, some other error (e.g. not a git repo)
      if e.returncode != 1:
        raise SystemExit('fatal: {}'.format(e))
      output = ''
    # show-ref prints '<sha> <refname>'; reversed() flips each pair so the
    # refname becomes the dict key
    return dict(reversed(x.split()) for x in output.splitlines())

  @staticmethod
  def get_config_settings(repo_working_dir):
    """
    Return a dict of git config key -> value (NUL-separated listing).
    """
    output = ''
    try:
      output = subproc.check_output('git config --list --null'.split(),
                                    cwd=repo_working_dir)
    except subprocess.CalledProcessError as e: # pragma: no cover
      raise SystemExit('fatal: {}'.format(e))

    # FIXME: Ignores multi-valued keys, just let them overwrite for now
    return dict(item.split(b'\n', maxsplit=1)
                for item in output.strip().split(b"\0") if item)

  @staticmethod
  def get_blob_sizes(quiet = False):
    """
    Return (unpacked_size, packed_size) dicts mapping blob sha1 -> size,
    gathered via `git cat-file --batch-all-objects`.
    """
    blob_size_progress = ProgressWriter()
    num_blobs = 0
    processed_blobs_msg = _("Processed %d blob sizes")

    # Get sizes of blobs by sha1
    cmd = '--batch-check=%(objectname) %(objecttype) ' + \
          '%(objectsize) %(objectsize:disk)'
    cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
                       bufsize = -1,
                       stdout = subprocess.PIPE)
    unpacked_size = {}
    packed_size = {}
    for line in cf.stdout:
      try:
        sha, objtype, objsize, objdisksize = line.split()
        objsize, objdisksize = int(objsize), int(objdisksize)
        # Non-blob objects (trees, commits, tags) are listed too; skip them
        if objtype == b'blob':
          unpacked_size[sha] = objsize
          packed_size[sha] = objdisksize
          num_blobs += 1
      except ValueError: # pragma: no cover
        sys.stderr.write(_("Error: unexpected `git cat-file` output: \"%s\"\n") % line)
      if not quiet:
        blob_size_progress.show(processed_blobs_msg % num_blobs)
    cf.wait()
    if not quiet:
      blob_size_progress.finish()
    return unpacked_size, packed_size

  @staticmethod
  def get_file_changes(repo, parent_hash, commit_hash):
    """
    Return a FileChanges list with the differences between parent_hash
    and commit_hash
    """
    file_changes = []

    cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash]
    output = subproc.check_output(cmd, cwd=repo)
    for line in output.splitlines():
      fileinfo, path = line.split(b'\t', 1)
      if path.startswith(b'"'):
        path = PathQuoting.dequote(path)
      oldmode, mode, oldhash, newhash, changetype = fileinfo.split()
      if changetype == b'D':
        file_changes.append(FileChange(b'D', path))
      elif changetype in (b'A', b'M', b'T'):
        # Prefer our filtered id for the blob if we assigned one
        identifier = BLOB_HASH_TO_NEW_ID.get(newhash, newhash)
        file_changes.append(FileChange(b'M', path, identifier, mode))
      else: # pragma: no cover
        raise SystemExit("Unknown change type for line {}".format(line))

    return file_changes

  @staticmethod
  def print_my_version():
    """
    Print an abbreviated hash identifying this script's (original) source,
    undoing local install-time substitutions first so the hash is stable.
    """
    with open(__file__, 'br') as f:
      contents = f.read()
    # If people replaced @@LOCALEDIR@@ string to point at their local
    # directory, undo it so we can get original source version.
    contents = re.sub(br'\A#\!.*',
                      br'#!/usr/bin/env python3', contents)
    contents = re.sub(br'(\("GIT_TEXTDOMAINDIR"\) or ").*"',
                      br'\1@@LOCALEDIR@@"', contents)

    cmd = 'git hash-object --stdin'.split()
    version = subproc.check_output(cmd, input=contents).strip()
    print(decode(version[0:12]))
1768
class FilteringOptions(object):
  """Parsing of git-filter-repo's command-line options.

  Holds the argparse actions for the --path* family of flags, option
  sanity checking, and parsing of auxiliary files (--replace-text,
  --paths-from-file, --strip-blobs-with-ids).
  """
  # Replacement used by --replace-text/--replace-message when a line does
  # not provide its own replacement via '==>'.
  default_replace_text = b'***REMOVED***'
  class AppendFilter(argparse.Action):
    """argparse action for the --path-* flags.

    Appends a (mod_type, match_type, match) tuple to namespace.path_changes,
    where mod_type is 'filter' or 'rename', match_type is 'match', 'glob',
    or 'regex', and match is the path argument (bytes; compiled pattern for
    'regex' filters, an (old, new) pair for renames).
    """
    def __call__(self, parser, namespace, values, option_string=None):
      user_path = values  # keep original spelling for error messages
      # '--path' leaves nothing after '--path-'; treat it as exact matching.
      suffix = option_string[len('--path-'):] or 'match'
      if suffix.startswith('rename'):
        mod_type = 'rename'
        match_type = option_string[len('--path-rename-'):] or 'match'
        values = values.split(b':')
        if len(values) != 2:
          raise SystemExit(_("Error: --path-rename expects one colon in its"
                             " argument: <old_name:new_name>."))
        # Both sides must agree on directory-ness (trailing slash).
        if values[0] and values[1] and not (
            values[0].endswith(b'/') == values[1].endswith(b'/')):
          raise SystemExit(_("Error: With --path-rename, if OLD_NAME and "
                             "NEW_NAME are both non-empty and either ends "
                             "with a slash then both must."))
        if any(v.startswith(b'/') for v in values):
          raise SystemExit(_("Error: Pathnames cannot begin with a '/'"))
        components = values[0].split(b'/') + values[1].split(b'/')
      else:
        mod_type = 'filter'
        match_type = suffix
        components = values.split(b'/')
        if values.startswith(b'/'):
          raise SystemExit(_("Error: Pathnames cannot begin with a '/'"))
      # Refuse '.' or '..' anywhere in any of the given path(s).
      for illegal_path in [b'.', b'..']:
        if illegal_path in components:
          raise SystemExit(_("Error: Invalid path component '%s' found in '%s'")
                           % (decode(illegal_path), decode(user_path)))
      if match_type == 'regex':
        values = re.compile(values)
      items = getattr(namespace, self.dest, []) or []
      items.append((mod_type, match_type, values))
      # A glob filter for a directory should also match files within it.
      if (match_type, mod_type) == ('glob', 'filter'):
        if not values.endswith(b'*'):
          extension = b'*' if values.endswith(b'/') else b'/*'
          items.append((mod_type, match_type, values+extension))
      setattr(namespace, self.dest, items)
1809
1810 class HelperFilter(argparse.Action):
1811 def __call__(self, parser, namespace, values, option_string=None):
1812 af = FilteringOptions.AppendFilter(dest='path_changes',
1813 option_strings=None)
1814 dirname = values if values[-1:] == b'/' else values+b'/'
1815 if option_string == '--subdirectory-filter':
1816 af(parser, namespace, dirname, '--path-match')
1817 af(parser, namespace, dirname+b':', '--path-rename')
1818 elif option_string == '--to-subdirectory-filter':
1819 af(parser, namespace, b':'+dirname, '--path-rename')
1820 else:
1821 raise SystemExit(_("Error: HelperFilter given invalid option_string: %s")
1822 % option_string) # pragma: no cover
1823
1824 class FileWithPathsFilter(argparse.Action):
1825 def __call__(self, parser, namespace, values, option_string=None):
1826 if not namespace.path_changes:
1827 namespace.path_changes = []
1828 namespace.path_changes += FilteringOptions.get_paths_from_file(values)
1829
  @staticmethod
  def create_arg_parser():
    """Build and return the argparse.ArgumentParser for git-filter-repo.

    Options are organized into thematic argument groups (analysis, path
    filtering/renaming, content/message/people filtering, parent rewriting,
    callbacks, locations, and miscellaneous control flags).  The long
    CALLBACKS/EXAMPLES text is attached as the epilog so it appears at the
    end of --help output verbatim.
    """
    # Include usage in the summary, so we can put the description first
    summary = _('''Rewrite (or analyze) repository history

    git-filter-repo destructively rewrites history (unless --analyze or
    --dry-run are given) according to specified rules. It refuses to do any
    rewriting unless either run from a clean fresh clone, or --force was
    given.

    Basic Usage:
      git-filter-repo --analyze
      git-filter-repo [FILTER/RENAME/CONTROL OPTIONS]

    See EXAMPLES section for details.
    ''').rstrip()

    # Provide a long helpful examples section
    example_text = _('''CALLBACKS

    Most callback functions are of the same general format. For a command line
    argument like
      --foo-callback 'BODY'

    the following code will be compiled and called:
      def foo_callback(foo):
        BODY

    The exception on callbacks is the --file-info-callback, which will be
    discussed further below.

    Given the callback style, we can thus make a simple callback to replace
    'Jon' with 'John' in author/committer/tagger names:
      git filter-repo --name-callback 'return name.replace(b"Jon", b"John")'

    To remove all 'Tested-by' tags in commit (or tag) messages:
      git filter-repo --message-callback 'return re.sub(br"\\nTested-by:.*", "", message)'

    To remove all .DS_Store files:
      git filter-repo --filename-callback 'return None if os.path.basename(filename) == b".DS_Store" else filename'

    Note that if BODY resolves to a filename, then the contents of that file
    will be used as the BODY in the callback function.

    The --file-info-callback has a more involved function callback; for it the
    following code will be compiled and called:
      def file_info_callback(filename, mode, blob_id, value):
        BODY

    It is designed to be used in cases where filtering depends on both
    filename and contents (and maybe mode). It is called for file changes
    other than deletions (since deletions have no file contents to operate
    on). This callback is expected to return a tuple of (filename, mode,
    blob_id). It can make use of the following functions from the value
    instance:
      value.get_contents_by_identifier(blob_id) -> contents (bytestring)
      value.get_size_by_identifier(blob_id) -> size_of_blob (int)
      value.insert_file_with_contents(contents) -> blob_id
      value.is_binary(contents) -> bool
      value.apply_replace_text(contents) -> new_contents (bytestring)
    and can read/write the following data member from the value instance:
      value.data (dict)

    The filename can be used for renaming the file similar to
    --filename-callback (or None to drop the change), and mode is one
    of b'100644', b'100755', b'120000', or b'160000'.

    For more detailed examples and explanations AND caveats, see
    https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#CALLBACKS

EXAMPLES

    To get a bunch of reports mentioning renames that have occurred in
    your repo and listing sizes of objects aggregated by any of path,
    directory, extension, or blob-id:
      git filter-repo --analyze

    (These reports can help you choose how to filter your repo; it can
    be useful to re-run this command after filtering to regenerate the
    report and verify the changes look correct.)

    To extract the history that touched just 'guides' and 'tools/releases':
      git filter-repo --path guides/ --path tools/releases

    To remove foo.zip and bar/baz/zips from every revision in history:
      git filter-repo --path foo.zip --path bar/baz/zips/ --invert-paths

    To replace the text 'password' with 'p455w0rd':
      git filter-repo --replace-text <(echo "password==>p455w0rd")

    To use the current version of the .mailmap file to update authors,
    committers, and taggers throughout history and make it permanent:
      git filter-repo --use-mailmap

    To extract the history of 'src/', rename all files to have a new leading
    directory 'my-module' (e.g. src/foo.java -> my-module/src/foo.java), and
    add a 'my-module-' prefix to all tags:
      git filter-repo --path src/ --to-subdirectory-filter my-module --tag-rename '':'my-module-'

    For more detailed examples and explanations, see
    https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#EXAMPLES''')

    # Create the basic parser
    # usage is suppressed because it is already embedded in `summary` above.
    parser = argparse.ArgumentParser(description=summary,
                                     usage = argparse.SUPPRESS,
                                     add_help = False,
                                     epilog = example_text,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    analyze = parser.add_argument_group(title=_("Analysis"))
    analyze.add_argument('--analyze', action='store_true',
        help=_("Analyze repository history and create a report that may be "
               "useful in determining what to filter in a subsequent run. "
               "Will not modify your repo."))
    analyze.add_argument('--report-dir',
        metavar='DIR_OR_FILE',
        type=os.fsencode,
        dest='report_dir',
        help=_("Directory to write report, defaults to GIT_DIR/filter_repo/analysis,"
               "refuses to run if exists, --force delete existing dir first."))

    path = parser.add_argument_group(title=_("Filtering based on paths "
                                             "(see also --filename-callback)"),
                                     description=textwrap.dedent(_("""
        These options specify the paths to select. Note that much like git
        itself, renames are NOT followed so you may need to specify multiple
        paths, e.g. `--path olddir/ --path newdir/`
        """[1:])))

    path.add_argument('--invert-paths', action='store_false', dest='inclusive',
        help=_("Invert the selection of files from the specified "
               "--path-{match,glob,regex} options below, i.e. only select "
               "files matching none of those options."))

    path.add_argument('--path-match', '--path', metavar='DIR_OR_FILE',
        type=os.fsencode,
        action=FilteringOptions.AppendFilter, dest='path_changes',
        help=_("Exact paths (files or directories) to include in filtered "
               "history. Multiple --path options can be specified to get "
               "a union of paths."))
    path.add_argument('--path-glob', metavar='GLOB', type=os.fsencode,
        action=FilteringOptions.AppendFilter, dest='path_changes',
        help=_("Glob of paths to include in filtered history. Multiple "
               "--path-glob options can be specified to get a union of "
               "paths."))
    path.add_argument('--path-regex', metavar='REGEX', type=os.fsencode,
        action=FilteringOptions.AppendFilter, dest='path_changes',
        help=_("Regex of paths to include in filtered history. Multiple "
               "--path-regex options can be specified to get a union of "
               "paths"))
    path.add_argument('--use-base-name', action='store_true',
        help=_("Match on file base name instead of full path from the top "
               "of the repo. Incompatible with --path-rename, and "
               "incompatible with matching against directory names."))

    rename = parser.add_argument_group(title=_("Renaming based on paths "
                                               "(see also --filename-callback)"))
    rename.add_argument('--path-rename', '--path-rename-match',
        metavar='OLD_NAME:NEW_NAME', dest='path_changes', type=os.fsencode,
        action=FilteringOptions.AppendFilter,
        help=_("Path to rename; if filename or directory matches OLD_NAME "
               "rename to NEW_NAME. Multiple --path-rename options can be "
               "specified. NOTE: If you combine filtering options with "
               "renaming ones, do not rely on a rename argument to select "
               "paths; you also need a filter to select them."))

    helpers = parser.add_argument_group(title=_("Path shortcuts"))
    # --paths exists only to catch a common typo; see parse_args().
    helpers.add_argument('--paths', help=argparse.SUPPRESS, metavar='IGNORE')
    helpers.add_argument('--paths-from-file', metavar='FILENAME',
        type=os.fsencode,
        action=FilteringOptions.FileWithPathsFilter, dest='path_changes',
        help=_("Specify several path filtering and renaming directives, one "
               "per line. Lines with '==>' in them specify path renames, "
               "and lines can begin with 'literal:' (the default), 'glob:', "
               "or 'regex:' to specify different matching styles. Blank "
               "lines and lines starting with a '#' are ignored."))
    helpers.add_argument('--subdirectory-filter', metavar='DIRECTORY',
        action=FilteringOptions.HelperFilter, type=os.fsencode,
        help=_("Only look at history that touches the given subdirectory "
               "and treat that directory as the project root. Equivalent "
               "to using '--path DIRECTORY/ --path-rename DIRECTORY/:'"))
    helpers.add_argument('--to-subdirectory-filter', metavar='DIRECTORY',
        action=FilteringOptions.HelperFilter, type=os.fsencode,
        help=_("Treat the project root as if it were under DIRECTORY. "
               "Equivalent to using '--path-rename :DIRECTORY/'"))

    contents = parser.add_argument_group(title=_("Content editing filters "
                                                 "(see also --blob-callback)"))
    contents.add_argument('--replace-text', metavar='EXPRESSIONS_FILE',
        help=_("A file with expressions that, if found, will be replaced. "
               "By default, each expression is treated as literal text, "
               "but 'regex:' and 'glob:' prefixes are supported. You can "
               "end the line with '==>' and some replacement text to "
               "choose a replacement choice other than the default of '{}'."
               .format(decode(FilteringOptions.default_replace_text))))
    contents.add_argument('--strip-blobs-bigger-than', metavar='SIZE',
        dest='max_blob_size', default=0,
        help=_("Strip blobs (files) bigger than specified size (e.g. '5M', "
               "'2G', etc)"))
    contents.add_argument('--strip-blobs-with-ids', metavar='BLOB-ID-FILENAME',
        help=_("Read git object ids from each line of the given file, and "
               "strip all of them from history"))

    refrename = parser.add_argument_group(title=_("Renaming of refs "
                                                  "(see also --refname-callback)"))
    refrename.add_argument('--tag-rename', metavar='OLD:NEW', type=os.fsencode,
        help=_("Rename tags starting with OLD to start with NEW. For "
               "example, --tag-rename foo:bar will rename tag foo-1.2.3 "
               "to bar-1.2.3; either OLD or NEW can be empty."))

    messages = parser.add_argument_group(title=_("Filtering of commit messages "
                                                 "(see also --message-callback)"))
    messages.add_argument('--replace-message', metavar='EXPRESSIONS_FILE',
        help=_("A file with expressions that, if found in commit or tag "
               "messages, will be replaced. This file uses the same syntax "
               "as --replace-text."))
    messages.add_argument('--preserve-commit-hashes', action='store_true',
        help=_("By default, since commits are rewritten and thus gain new "
               "hashes, references to old commit hashes in commit messages "
               "are replaced with new commit hashes (abbreviated to the same "
               "length as the old reference). Use this flag to turn off "
               "updating commit hashes in commit messages."))
    messages.add_argument('--preserve-commit-encoding', action='store_true',
        help=_("Do not reencode commit messages into UTF-8. By default, if "
               "the commit object specifies an encoding for the commit "
               "message, the message is re-encoded into UTF-8."))

    people = parser.add_argument_group(title=_("Filtering of names & emails "
                                               "(see also --name-callback "
                                               "and --email-callback)"))
    people.add_argument('--mailmap', dest='mailmap', metavar='FILENAME',
        type=os.fsencode,
        help=_("Use specified mailmap file (see git-shortlog(1) for "
               "details on the format) when rewriting author, committer, "
               "and tagger names and emails. If the specified file is "
               "part of git history, historical versions of the file will "
               "be ignored; only the current contents are consulted."))
    people.add_argument('--use-mailmap', dest='mailmap',
        action='store_const', const=b'.mailmap',
        help=_("Same as: '--mailmap .mailmap' "))

    parents = parser.add_argument_group(title=_("Parent rewriting"))
    parents.add_argument('--replace-refs', default=None,
        choices=['delete-no-add', 'delete-and-add',
                 'update-no-add', 'update-or-add',
                 'update-and-add', 'old-default'],
        help=_("How to handle replace refs (see git-replace(1)). Replace "
               "refs can be added during the history rewrite as a way to "
               "allow users to pass old commit IDs (from before "
               "git-filter-repo was run) to git commands and have git know "
               "how to translate those old commit IDs to the new "
               "(post-rewrite) commit IDs. Also, replace refs that existed "
               "before the rewrite can either be deleted or updated. The "
               "choices to pass to --replace-refs thus need to specify both "
               "what to do with existing refs and what to do with commit "
               "rewrites. Thus 'update-and-add' means to update existing "
               "replace refs, and for any commit rewrite (even if already "
               "pointed at by a replace ref) add a new refs/replace/ reference "
               "to map from the old commit ID to the new commit ID. The "
               "default is update-no-add, meaning update existing replace refs "
               "but do not add any new ones. There is also a special "
               "'old-default' option for picking the default used in versions "
               "prior to git-filter-repo-2.45, namely 'update-and-add' upon "
               "the first run of git-filter-repo in a repository and "
               "'update-or-add' if running git-filter-repo again on a "
               "repository."))
    parents.add_argument('--prune-empty', default='auto',
        choices=['always', 'auto', 'never'],
        help=_("Whether to prune empty commits. 'auto' (the default) means "
               "only prune commits which become empty (not commits which were "
               "empty in the original repo, unless their parent was pruned). "
               "When the parent of a commit is pruned, the first non-pruned "
               "ancestor becomes the new parent."))
    parents.add_argument('--prune-degenerate', default='auto',
        choices=['always', 'auto', 'never'],
        help=_("Since merge commits are needed for history topology, they "
               "are typically exempt from pruning. However, they can become "
               "degenerate with the pruning of other commits (having fewer "
               "than two parents, having one commit serve as both parents, or "
               "having one parent as the ancestor of the other.) If such "
               "merge commits have no file changes, they can be pruned. The "
               "default ('auto') is to only prune empty merge commits which "
               "become degenerate (not which started as such)."))
    parents.add_argument('--no-ff', action='store_true',
        help=_("Even if the first parent is or becomes an ancestor of another "
               "parent, do not prune it. This modifies how "
               "--prune-degenerate behaves, and may be useful in projects who "
               "always use merge --no-ff."))

    callback = parser.add_argument_group(title=_("Generic callback code snippets"))
    callback.add_argument('--filename-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing filenames; see CALLBACKS "
               "sections below."))
    callback.add_argument('--file-info-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing file and metadata; see "
               "CALLBACKS sections below."))
    callback.add_argument('--message-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing messages (both commit "
               "messages and tag messages); see CALLBACKS section below."))
    callback.add_argument('--name-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing names of people; see "
               "CALLBACKS section below."))
    callback.add_argument('--email-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing emails addresses; see "
               "CALLBACKS section below."))
    callback.add_argument('--refname-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing refnames; see CALLBACKS "
               "section below."))

    callback.add_argument('--blob-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing blob objects; see "
               "CALLBACKS section below."))
    callback.add_argument('--commit-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing commit objects; see "
               "CALLBACKS section below."))
    callback.add_argument('--tag-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing tag objects. Note that "
               "lightweight tags have no tag object and are thus not "
               "handled by this callback. See CALLBACKS section below."))
    callback.add_argument('--reset-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing reset objects; see "
               "CALLBACKS section below."))

    sdr = parser.add_argument_group(title=_("Sensitive Data Removal Handling"))
    sdr.add_argument('--sensitive-data-removal', '--sdr', action='store_true',
        help=_("This rewrite is intended to remove sensitive data from a "
               "repository. Gather extra information from the rewrite needed "
               "to provide additional instructions on how to clean up other "
               "copies."))
    sdr.add_argument('--no-fetch', action='store_true',
        help=_("By default, --sensitive-data-removal will trigger a "
               "mirror-like fetch of all refs from origin, discarding local "
               "changes, but ensuring that _all_ fetchable refs that hold on "
               "to the sensitve data are rewritten. This flag removes that "
               "fetch, risking that other refs continue holding on to the "
               "sensitive data. This option is implied by --partial or any "
               "flag that implies --partial."))

    desc = _(
      "Specifying alternate source or target locations implies --partial,\n"
      "except that the normal default for --replace-refs is used. However,\n"
      "unlike normal uses of --partial, this doesn't risk mixing old and new\n"
      "history since the old and new histories are in different repositories.")
    location = parser.add_argument_group(title=_("Location to filter from/to"),
                                         description=desc)
    location.add_argument('--source', type=os.fsencode,
        help=_("Git repository to read from"))
    location.add_argument('--target', type=os.fsencode,
        help=_("Git repository to overwrite with filtered history"))

    order = parser.add_argument_group(title=_("Ordering of commits"))
    order.add_argument('--date-order', action='store_true',
        help=_("Processes commits in commit timestamp order."))

    misc = parser.add_argument_group(title=_("Miscellaneous options"))
    misc.add_argument('--help', '-h', action='store_true',
        help=_("Show this help message and exit."))
    misc.add_argument('--version', action='store_true',
        help=_("Display filter-repo's version and exit."))
    misc.add_argument('--proceed', action='store_true',
        help=_("Avoid triggering the no-arguments-specified check."))
    misc.add_argument('--force', '-f', action='store_true',
        help=_("Rewrite repository history even if the current repo does not "
               "look like a fresh clone. History rewriting is irreversible "
               "(and includes immediate pruning of reflogs and old objects), "
               "so be cautious about using this flag."))
    misc.add_argument('--partial', action='store_true',
        help=_("Do a partial history rewrite, resulting in the mixture of "
               "old and new history. This disables rewriting "
               "refs/remotes/origin/* to refs/heads/*, disables removing "
               "of the 'origin' remote, disables removing unexported refs, "
               "disables expiring the reflog, and disables the automatic "
               "post-filter gc. Also, this modifies --tag-rename and "
               "--refname-callback options such that instead of replacing "
               "old refs with new refnames, it will instead create new "
               "refs and keep the old ones around. Use with caution."))
    misc.add_argument('--no-gc', action='store_true',
        help=_("Do not run 'git gc' after filtering."))
    # WARNING: --refs presents a problem with become-degenerate pruning:
    #   * Excluding a commit also excludes its ancestors so when some other
    #     commit has an excluded ancestor as a parent we have no way of
    #     knowing what it is an ancestor of without doing a special
    #     full-graph walk.
    misc.add_argument('--refs', nargs='+',
        help=_("Limit history rewriting to the specified refs. Implies "
               "--partial. In addition to the normal caveats of --partial "
               "(mixing old and new history, no automatic remapping of "
               "refs/remotes/origin/* to refs/heads/*, etc.), this also may "
               "cause problems for pruning of degenerate empty merge "
               "commits when negative revisions are specified."))

    misc.add_argument('--dry-run', action='store_true',
        help=_("Do not change the repository. Run `git fast-export` and "
               "filter its output, and save both the original and the "
               "filtered version for comparison. This also disables "
               "rewriting commit messages due to not knowing new commit "
               "IDs and disables filtering of some empty commits due to "
               "inability to query the fast-import backend." ))
    misc.add_argument('--debug', action='store_true',
        help=_("Print additional information about operations being "
               "performed and commands being run. When used together "
               "with --dry-run, also show extra information about what "
               "would be run."))
    # WARNING: --state-branch has some problems:
    #   * It does not work well with manually inserted objects (user creating
    #     Blob() or Commit() or Tag() objects and calling
    #     RepoFilter.insert(obj) on them).
    #   * It does not work well with multiple source or multiple target repos
    #   * It doesn't work so well with pruning become-empty commits (though
    #     --refs doesn't work so well with it either)
    # These are probably fixable, given some work (e.g. re-importing the
    # graph at the beginning to get the AncestryGraph right, doing our own
    # export of marks instead of using fast-export --export-marks, etc.), but
    # for now just hide the option.
    misc.add_argument('--state-branch',
        #help=_("Enable incremental filtering by saving the mapping of old "
        #       "to new objects to the specified branch upon exit, and"
        #       "loading that mapping from that branch (if it exists) "
        #       "upon startup."))
        help=argparse.SUPPRESS)
    misc.add_argument('--stdin', action='store_true',
        help=_("Instead of running `git fast-export` and filtering its "
               "output, filter the fast-export stream from stdin. The "
               "stdin must be in the expected input format (e.g. it needs "
               "to include original-oid directives)."))
    misc.add_argument('--quiet', action='store_true',
        help=_("Pass --quiet to other git commands called"))
    return parser
2258
2259 @staticmethod
2260 def sanity_check_args(args):
2261 if args.analyze and args.path_changes:
2262 raise SystemExit(_("Error: --analyze is incompatible with --path* flags; "
2263 "it's a read-only operation."))
2264 if args.analyze and args.stdin:
2265 raise SystemExit(_("Error: --analyze is incompatible with --stdin."))
2266 # If no path_changes are found, initialize with empty list but mark as
2267 # not inclusive so that all files match
2268 if args.path_changes == None:
2269 args.path_changes = []
2270 args.inclusive = False
2271 else:
2272 # Similarly, if we have no filtering paths, then no path should be
2273 # filtered out. Based on how newname() works, the easiest way to
2274 # achieve that is setting args.inclusive to False.
2275 if not any(x[0] == 'filter' for x in args.path_changes):
2276 args.inclusive = False
2277 # Also check for incompatible --use-base-name and --path-rename flags.
2278 if args.use_base_name:
2279 if any(x[0] == 'rename' for x in args.path_changes):
2280 raise SystemExit(_("Error: --use-base-name and --path-rename are "
2281 "incompatible."))
2282 # Also throw some sanity checks on git version here;
2283 # PERF: remove these checks once new enough git versions are common
2284 p = subproc.Popen('git fast-export -h'.split(),
2285 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2286 output = p.stdout.read()
2287 if b'--anonymize-map' not in output: # pragma: no cover
2288 global date_format_permissive
2289 date_format_permissive = False
2290 if not any(x in output for x in [b'--mark-tags',b'--[no-]mark-tags']): # pragma: no cover
2291 global write_marks
2292 write_marks = False
2293 if args.state_branch:
2294 # We need a version of git-fast-export with --mark-tags
2295 raise SystemExit(_("Error: need git >= 2.24.0"))
2296 if not any(x in output for x in [b'--reencode', b'--[no-]reencode']): # pragma: no cover
2297 if args.preserve_commit_encoding:
2298 # We need a version of git-fast-export with --reencode
2299 raise SystemExit(_("Error: need git >= 2.23.0"))
2300 else:
2301 # Set args.preserve_commit_encoding to None which we'll check for later
2302 # to avoid passing --reencode=yes to fast-export (that option was the
2303 # default prior to git-2.23)
2304 args.preserve_commit_encoding = None
2305 # If we don't have fast-exoprt --reencode, we may also be missing
2306 # diff-tree --combined-all-paths, which is even more important...
2307 p = subproc.Popen('git diff-tree -h'.split(),
2308 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2309 output = p.stdout.read()
2310 if b'--combined-all-paths' not in output:
2311 # We need a version of git-diff-tree with --combined-all-paths
2312 raise SystemExit(_("Error: need git >= 2.22.0"))
2313 if args.sensitive_data_removal:
2314 p = subproc.Popen('git cat-file -h'.split(),
2315 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2316 output = p.stdout.read()
2317 if b"--batch-command" not in output: # pragma: no cover
2318 raise SystemExit(_("Error: need git >= 2.36.0"))
2319 # End of sanity checks on git version
2320 if args.max_blob_size:
2321 suffix = args.max_blob_size[-1]
2322 if suffix not in '1234567890':
2323 mult = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
2324 if suffix not in mult:
2325 raise SystemExit(_("Error: could not parse --strip-blobs-bigger-than"
2326 " argument %s")
2327 % args.max_blob_size)
2328 args.max_blob_size = int(args.max_blob_size[0:-1]) * mult[suffix]
2329 else:
2330 args.max_blob_size = int(args.max_blob_size)
2331 if args.file_info_callback and (
2332 args.stdin or args.blob_callback or args.filename_callback):
2333 raise SystemExit(_("Error: --file-info-callback is incompatible with "
2334 "--stdin, --blob-callback,\nand --filename-callback."))
2335
2336 @staticmethod
2337 def get_replace_text(filename):
2338 replace_literals = []
2339 replace_regexes = []
2340 with open(filename, 'br') as f:
2341 for line in f:
2342 line = line.rstrip(b'\r\n')
2343
2344 # Determine the replacement
2345 replacement = FilteringOptions.default_replace_text
2346 if b'==>' in line:
2347 line, replacement = line.rsplit(b'==>', 1)
2348
2349 # See if we need to match via regex
2350 regex = None
2351 if line.startswith(b'regex:'):
2352 regex = line[6:]
2353 elif line.startswith(b'glob:'):
2354 regex = glob_to_regex(line[5:])
2355 if regex:
2356 replace_regexes.append((re.compile(regex), replacement))
2357 else:
2358 # Otherwise, find the literal we need to replace
2359 if line.startswith(b'literal:'):
2360 line = line[8:]
2361 if not line:
2362 continue
2363 replace_literals.append((line, replacement))
2364 return {'literals': replace_literals, 'regexes': replace_regexes}
2365
2366 @staticmethod
2367 def get_paths_from_file(filename):
2368 new_path_changes = []
2369 with open(filename, 'br') as f:
2370 for line in f:
2371 line = line.rstrip(b'\r\n')
2372
2373 # Skip blank lines
2374 if not line:
2375 continue
2376 # Skip comment lines
2377 if line.startswith(b'#'):
2378 continue
2379
2380 # Determine the replacement
2381 match_type, repl = 'literal', None
2382 if b'==>' in line:
2383 line, repl = line.rsplit(b'==>', 1)
2384
2385 # See if we need to match via regex
2386 match_type = 'match' # a.k.a. 'literal'
2387 if line.startswith(b'regex:'):
2388 match_type = 'regex'
2389 match = re.compile(line[6:])
2390 elif line.startswith(b'glob:'):
2391 match_type = 'glob'
2392 match = line[5:]
2393 if repl:
2394 raise SystemExit(_("Error: In %s, 'glob:' and '==>' are incompatible (renaming globs makes no sense)" % decode(filename)))
2395 else:
2396 if line.startswith(b'literal:'):
2397 match = line[8:]
2398 else:
2399 match = line
2400 if repl is not None:
2401 if match and repl and match.endswith(b'/') != repl.endswith(b'/'):
2402 raise SystemExit(_("Error: When rename directories, if OLDNAME "
2403 "and NEW_NAME are both non-empty and either "
2404 "ends with a slash then both must."))
2405
2406 # Record the filter or rename
2407 if repl is not None:
2408 new_path_changes.append(['rename', match_type, (match, repl)])
2409 else:
2410 new_path_changes.append(['filter', match_type, match])
2411 if match_type == 'glob' and not match.endswith(b'*'):
2412 extension = b'*' if match.endswith(b'/') else b'/*'
2413 new_path_changes.append(['filter', match_type, match+extension])
2414 return new_path_changes
2415
2416 @staticmethod
2417 def default_options():
2418 return FilteringOptions.parse_args([], error_on_empty = False)
2419
  @staticmethod
  def parse_args(input_args, error_on_empty = True):
    """Parse command-line tokens into a fully normalized options object.

    Handles --help/--version (which exit), runs sanity_check_args(), and
    converts several arguments into richer objects (MailmapInfo, replace
    expression dicts, blob-id sets).  Also derives implied settings such
    as --partial, --no-fetch, args.repack, and the default refs.

    input_args: list of command-line tokens (without argv[0]).
    error_on_empty: if True, print usage and exit when input_args is empty.
    """
    parser = FilteringOptions.create_arg_parser()
    if not input_args and error_on_empty:
      parser.print_usage()
      raise SystemExit(_("No arguments specified."))
    args = parser.parse_args(input_args)
    if args.help:
      parser.print_help()
      raise SystemExit()
    if args.paths:
      # --paths only exists to catch this common typo; point at the real flags.
      raise SystemExit("Error: Option `--paths` unrecognized; did you mean --path or --paths-from-file?")
    if args.version:
      GitUtils.print_my_version()
      raise SystemExit()
    FilteringOptions.sanity_check_args(args)
    if args.mailmap:
      args.mailmap = MailmapInfo(args.mailmap)
    if args.replace_text:
      args.replace_text = FilteringOptions.get_replace_text(args.replace_text)
    if args.replace_message:
      args.replace_message = FilteringOptions.get_replace_text(args.replace_message)
    if args.strip_blobs_with_ids:
      with open(args.strip_blobs_with_ids, 'br') as f:
        args.strip_blobs_with_ids = set(f.read().split())
    else:
      args.strip_blobs_with_ids = set()
    if (args.partial or args.refs) and not args.replace_refs:
      # Partial rewrites default to updating existing replace refs only.
      args.replace_refs = 'update-no-add'
    args.repack = not (args.partial or args.refs or args.no_gc)
    if args.refs or args.source or args.target:
      args.partial = True
    if args.partial:
      args.no_fetch = True
    if not args.refs:
      args.refs = ['--all']
    return args
2457
class RepoAnalyze(object):
  """Implements `git filter-repo --analyze`: walks all history via
  rev-list|diff-tree, gathers size/deletion/rename statistics, and writes a
  set of human-readable report files into a report directory."""

  # First, several helper functions for analyze_commit()

  @staticmethod
  def equiv_class(stats, filename):
    """Return the tuple of names considered equivalent to filename (due to
    renames); a file never renamed is its own singleton class."""
    return stats['equivalence'].get(filename, (filename,))

  @staticmethod
  def setup_equivalence_for_rename(stats, oldname, newname):
    # if A is renamed to B and B is renamed to C, then the user thinks of
    # A, B, and C as all being different names for the same 'file'.  We record
    # this as an equivalence class:
    #   stats['equivalence'][name] = (A,B,C)
    # for name being each of A, B, and C.
    old_tuple = stats['equivalence'].get(oldname, ())
    if newname in old_tuple:
      return
    elif old_tuple:
      new_tuple = tuple(list(old_tuple)+[newname])
    else:
      new_tuple = (oldname, newname)
    for f in new_tuple:
      stats['equivalence'][f] = new_tuple

  @staticmethod
  def setup_or_update_rename_history(stats, commit, oldname, newname):
    """Record that commit renamed oldname (newname kept for API symmetry)."""
    rename_commits = stats['rename_history'].get(oldname, set())
    rename_commits.add(commit)
    stats['rename_history'][oldname] = rename_commits

  @staticmethod
  def handle_renames(stats, commit, change_types, filenames):
    """Update equivalence classes and rename history for each 'R' entry in
    change_types; the rename target is always the last filename."""
    for index, change_type in enumerate(change_types):
      if change_type == ord(b'R'):
        oldname, newname = filenames[index], filenames[-1]
        RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
        RepoAnalyze.setup_or_update_rename_history(stats, commit,
                                                   oldname, newname)

  @staticmethod
  def handle_file(stats, graph, commit, modes, shas, filenames):
    """Process a modify/add/rename of a single path in a commit, updating
    name tracking and undoing any previously-recorded deletion."""
    mode, sha, filename = modes[-1], shas[-1], filenames[-1]

    # Figure out kind of deletions to undo for this file, and update lists
    # of all-names-by-sha and all-filenames
    delmode = 'tree_deletions'
    if mode != b'040000':
      delmode = 'file_deletions'
      stats['names'][sha].add(filename)
      stats['allnames'].add(filename)

    # If the file (or equivalence class of files) was recorded as deleted,
    # clearly it isn't anymore
    equiv = RepoAnalyze.equiv_class(stats, filename)
    for f in equiv:
      stats[delmode].pop(f, None)

    # If we get a modify/add for a path that was renamed, we may need to break
    # the equivalence class.  However, if the modify/add was on a branch that
    # doesn't have the rename in its history, we are still okay.
    need_to_break_equivalence = False
    if equiv[-1] != filename:
      for rename_commit in stats['rename_history'][filename]:
        if graph.is_ancestor(rename_commit, commit):
          need_to_break_equivalence = True

    if need_to_break_equivalence:
      for f in equiv:
        if f in stats['equivalence']:
          del stats['equivalence'][f]

  @staticmethod
  def analyze_commit(stats, graph, commit, parents, date, file_changes):
    """Fold a single commit's file_changes into stats; raises SystemExit on
    change types the analysis does not know how to handle."""
    graph.add_commit_and_parents(commit, parents)
    for change in file_changes:
      modes, shas, change_types, filenames = change
      if len(parents) == 1 and change_types.startswith(b'R'):
        change_types = b'R'  # remove the rename score; we don't care
      if modes[-1] == b'160000':
        continue  # ignore submodule (gitlink) entries
      elif modes[-1] == b'000000':
        # Track when files/directories are deleted
        for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
          if any(x == b'040000' for x in modes[0:-1]):
            stats['tree_deletions'][f] = date
          else:
            stats['file_deletions'][f] = date
      elif change_types.strip(b'AMT') == b'':
        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
      elif modes[-1] == b'040000' and change_types.strip(b'RAM') == b'':
        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
      elif change_types.strip(b'RAMT') == b'':
        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
        RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
      else:
        raise SystemExit(_("Unhandled change type(s): %(change_type)s "
                           "(in commit %(commit)s)")
                         % ({'change_type': change_types, 'commit': commit})
                         ) # pragma: no cover

  @staticmethod
  def gather_data(args):
    """Run rev-list|diff-tree over args.refs and return the stats dict used
    by write_report()."""
    unpacked_size, packed_size = GitUtils.get_blob_sizes()
    stats = {'names': collections.defaultdict(set),
             'allnames' : set(),
             'file_deletions': {},
             'tree_deletions': {},
             'equivalence': {},
             'rename_history': collections.defaultdict(set),
             'unpacked_size': unpacked_size,
             'packed_size': packed_size,
             'num_commits': 0}

    # Setup the rev-list/diff-tree process
    processed_commits_msg = _("Processed %d commits")
    commit_parse_progress = ProgressWriter()
    num_commits = 0
    cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
           ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
           ' --date=short -M -t -c --raw --combined-all-paths')
    dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
    f = dtp.stdout
    line = f.readline()
    if not line:
      raise SystemExit(_("Nothing to analyze; repository is empty."))
    cont = bool(line)
    graph = AncestryGraph()
    while cont:
      commit = line.rstrip()
      parents = f.readline().split()
      date = f.readline().rstrip()

      # We expect a blank line next; if we get a non-blank line then
      # this commit modified no files and we need to move on to the next.
      # If there is no line, we've reached end-of-input.
      line = f.readline()
      if not line:
        cont = False
      line = line.rstrip()

      # If we haven't reached end of input, and we got a blank line meaning
      # a commit that has modified files, then get the file changes associated
      # with this commit.
      file_changes = []
      if cont and not line:
        cont = False
        for line in f:
          if not line.startswith(b':'):
            cont = True
            break
          n = 1+max(1, len(parents))
          assert line.startswith(b':'*(n-1))
          relevant = line[n-1:-1]
          splits = relevant.split(None, n)
          modes = splits[0:n]
          splits = splits[n].split(None, n)
          shas = splits[0:n]
          splits = splits[n].split(b'\t')
          change_types = splits[0]
          filenames = [PathQuoting.dequote(x) for x in splits[1:]]
          file_changes.append([modes, shas, change_types, filenames])

      # If someone is trying to analyze a subset of the history, make sure
      # to avoid dying on commits with parents that we haven't seen before
      if args.refs:
        graph.record_external_commits([p for p in parents
                                       if p not in graph.value])

      # Analyze this commit and update progress
      RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
                                 file_changes)
      num_commits += 1
      commit_parse_progress.show(processed_commits_msg % num_commits)

    # Show the final commits processed message and record the number of commits
    commit_parse_progress.finish()
    stats['num_commits'] = num_commits

    # Close the output, ensure rev-list|diff-tree pipeline completed successfully
    dtp.stdout.close()
    if dtp.wait():
      raise SystemExit(_("Error: rev-list|diff-tree pipeline failed; see above.")) # pragma: no cover

    return stats

  @staticmethod
  def write_report(reportdir, stats):
    """Write the README, renames, directory/extension/path size, and
    blob-sha report files into reportdir from the gathered stats."""
    def datestr(datetimestr):
      # A falsy deletion date means the path still exists.
      return datetimestr if datetimestr else _('<present>').encode()

    def dirnames(path):
      # Yield each ancestor directory of path, ending with b'' (toplevel).
      while True:
        path = os.path.dirname(path)
        yield path
        if path == b'':
          break

    # Compute aggregate size information for paths, extensions, and dirs
    total_size = {'packed': 0, 'unpacked': 0}
    path_size = {'packed': collections.defaultdict(int),
                 'unpacked': collections.defaultdict(int)}
    ext_size = {'packed': collections.defaultdict(int),
                'unpacked': collections.defaultdict(int)}
    dir_size = {'packed': collections.defaultdict(int),
                'unpacked': collections.defaultdict(int)}
    for sha in stats['names']:
      size = {'packed': stats['packed_size'][sha],
              'unpacked': stats['unpacked_size'][sha]}
      for which in ('packed', 'unpacked'):
        for name in stats['names'][sha]:
          total_size[which] += size[which]
          path_size[which][name] += size[which]
          ext = os.path.splitext(name)[1]
          ext_size[which][ext] += size[which]
          for dirname in dirnames(name):
            dir_size[which][dirname] += size[which]

    # Determine if and when extensions and directories were deleted
    ext_deleted_data = {}
    for name in stats['allnames']:
      when = stats['file_deletions'].get(name, None)

      # Update the extension; an extension is considered deleted only if
      # every file with that extension was deleted (None means still alive).
      ext = os.path.splitext(name)[1]
      if when is None:
        ext_deleted_data[ext] = None
      elif ext in ext_deleted_data:
        if ext_deleted_data[ext] is not None:
          # Date strings are YYYY-MM-DD, so lexicographic max is latest.
          ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
      else:
        ext_deleted_data[ext] = when

    dir_deleted_data = {}
    for name in dir_size['packed']:
      dir_deleted_data[name] = stats['tree_deletions'].get(name, None)

    with open(os.path.join(reportdir, b"README"), 'bw') as f:
      # Give a basic overview of this file
      f.write(b"== %s ==\n" % _("Overall Statistics").encode())
      f.write(("  %s: %d\n" % (_("Number of commits"),
                               stats['num_commits'])).encode())
      f.write(("  %s: %d\n" % (_("Number of filenames"),
                               len(path_size['packed']))).encode())
      f.write(("  %s: %d\n" % (_("Number of directories"),
                               len(dir_size['packed']))).encode())
      f.write(("  %s: %d\n" % (_("Number of file extensions"),
                               len(ext_size['packed']))).encode())
      f.write(b"\n")
      f.write(("  %s: %d\n" % (_("Total unpacked size (bytes)"),
                               total_size['unpacked'])).encode())
      f.write(("  %s: %d\n" % (_("Total packed size (bytes)"),
                               total_size['packed'])).encode())
      f.write(b"\n")

      # Mention issues with the report
      f.write(("== %s ==\n" % _("Caveats")).encode())
      f.write(("=== %s ===\n" % _("Sizes")).encode())
      f.write(textwrap.dedent(_("""
        Packed size represents what size your repository would be if no
        trees, commits, tags, or other metadata were included (though it may
        fail to represent de-duplication; see below).  It also represents the
        current packing, which may be suboptimal if you haven't gc'ed for a
        while.

        Unpacked size represents what size your repository would be if no
        trees, commits, tags, or other metadata were included AND if no
        files were packed; i.e., without delta-ing or compression.

        Both unpacked and packed sizes can be slightly misleading.  Deleting
        a blob from history does not save as much space as the unpacked size,
        because it is obviously normally stored in packed form.  Also,
        deleting a blob from history may not save as much space as its packed
        size either, because another blob could be stored as a delta against
        that blob, so when you remove one blob another blob's packed size may
        grow.

        Also, the sum of the packed sizes can add up to more than the
        repository size; if the same contents appeared in the repository in
        multiple places, git will automatically de-dupe and store only one
        copy, while the way sizes are added in this analysis adds the size
        for each file path that has those contents.  Further, if a file is
        ever reverted to a previous version's contents, the previous
        version's size will be counted multiple times in this analysis, even
        though git will only store it once.
        """)[1:]).encode())
      f.write(b"\n")
      f.write(("=== %s ===\n" % _("Deletions")).encode())
      f.write(textwrap.dedent(_("""
        Whether a file is deleted is not a binary quality, since it can be
        deleted on some branches but still exist in others.  Also, it might
        exist in an old tag, but have been deleted in versions newer than
        that.  More thorough tracking could be done, including looking at
        merge commits where one side of history deleted and the other modified,
        in order to give a more holistic picture of deletions.  However, that
        algorithm would not only be more complex to implement, it'd also be
        quite difficult to present and interpret by users.  Since --analyze
        is just about getting a high-level rough picture of history, it instead
        implements the simplistic rule that is good enough for 98% of cases:
          A file is marked as deleted if the last commit in the fast-export
          stream that mentions the file lists it as deleted.
        This makes it dependent on topological ordering, but generally gives
        the "right" answer.
        """)[1:]).encode())
      f.write(b"\n")
      f.write(("=== %s ===\n" % _("Renames")).encode())
      f.write(textwrap.dedent(_("""
        Renames share the same non-binary nature that deletions do, plus
        additional challenges:
          * If the renamed file is renamed again, instead of just two names for
            a path you can have three or more.
          * Rename pairs of the form (oldname, newname) that we consider to be
            different names of the "same file" might only be valid over certain
            commit ranges.  For example, if a new commit reintroduces a file
            named oldname, then new versions of oldname aren't the "same file"
            anymore.  We could try to portray this to the user, but it's easier
            for the user to just break the pairing and only report unbroken
            rename pairings to the user.
          * The ability for users to rename files differently in different
            branches means that our chains of renames will not necessarily be
            linear but may branch out.
        """)[1:]).encode())
      f.write(b"\n")

    # Equivalence classes for names, so if folks only want to keep a
    # certain set of paths, they know the old names they want to include
    # too.
    with open(os.path.join(reportdir, b"renames.txt"), 'bw') as f:
      seen = set()
      for pathname,equiv_group in sorted(stats['equivalence'].items(),
                                         key=lambda x:(x[1], x[0])):
        if equiv_group in seen:
          continue
        seen.add(equiv_group)
        f.write(("{} ->\n    ".format(decode(equiv_group[0])) +
                 "\n    ".join(decode(x) for x in equiv_group[1:]) +
                 "\n").encode())

    # List directories in reverse sorted order of unpacked size
    with open(os.path.join(reportdir, b"directories-deleted-sizes.txt"), 'bw') as f:
      msg = "=== %s ===\n" % _("Deleted directories by reverse size")
      f.write(msg.encode())
      msg = _("Format: unpacked size, packed size, date deleted, directory name\n")
      f.write(msg.encode())
      for dirname, size in sorted(dir_size['packed'].items(),
                                  key=lambda x:(x[1],x[0]), reverse=True):
        if (dir_deleted_data[dirname]):
          f.write(b"  %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname],
                                               size,
                                               datestr(dir_deleted_data[dirname]),
                                               dirname or _('<toplevel>').encode()))

    with open(os.path.join(reportdir, b"directories-all-sizes.txt"), 'bw') as f:
      f.write(("=== %s ===\n" % _("All directories by reverse size")).encode())
      msg = _("Format: unpacked size, packed size, date deleted, directory name\n")
      f.write(msg.encode())
      for dirname, size in sorted(dir_size['packed'].items(),
                                  key=lambda x:(x[1],x[0]), reverse=True):
        f.write(b"  %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname],
                                             size,
                                             datestr(dir_deleted_data[dirname]),
                                             dirname or _("<toplevel>").encode()))

    # List extensions in reverse sorted order of unpacked size
    with open(os.path.join(reportdir, b"extensions-deleted-sizes.txt"), 'bw') as f:
      msg = "=== %s ===\n" % _("Deleted extensions by reverse size")
      f.write(msg.encode())
      msg = _("Format: unpacked size, packed size, date deleted, extension name\n")
      f.write(msg.encode())
      for extname, size in sorted(ext_size['packed'].items(),
                                  key=lambda x:(x[1],x[0]), reverse=True):
        if (ext_deleted_data[extname]):
          f.write(b"  %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname],
                                               size,
                                               datestr(ext_deleted_data[extname]),
                                               extname or _('<no extension>').encode()))

    with open(os.path.join(reportdir, b"extensions-all-sizes.txt"), 'bw') as f:
      f.write(("=== %s ===\n" % _("All extensions by reverse size")).encode())
      msg = _("Format: unpacked size, packed size, date deleted, extension name\n")
      f.write(msg.encode())
      for extname, size in sorted(ext_size['packed'].items(),
                                  key=lambda x:(x[1],x[0]), reverse=True):
        f.write(b"  %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname],
                                             size,
                                             datestr(ext_deleted_data[extname]),
                                             extname or _('<no extension>').encode()))

    # List files in reverse sorted order of unpacked size
    with open(os.path.join(reportdir, b"path-deleted-sizes.txt"), 'bw') as f:
      msg = "=== %s ===\n" % _("Deleted paths by reverse accumulated size")
      f.write(msg.encode())
      msg = _("Format: unpacked size, packed size, date deleted, path name(s)\n")
      f.write(msg.encode())
      for pathname, size in sorted(path_size['packed'].items(),
                                   key=lambda x:(x[1],x[0]), reverse=True):
        when = stats['file_deletions'].get(pathname, None)
        if when:
          f.write(b"  %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname],
                                               size,
                                               datestr(when),
                                               pathname))

    with open(os.path.join(reportdir, b"path-all-sizes.txt"), 'bw') as f:
      msg = "=== %s ===\n" % _("All paths by reverse accumulated size")
      f.write(msg.encode())
      msg = _("Format: unpacked size, packed size, date deleted, path name\n")
      f.write(msg.encode())
      for pathname, size in sorted(path_size['packed'].items(),
                                   key=lambda x:(x[1],x[0]), reverse=True):
        when = stats['file_deletions'].get(pathname, None)
        f.write(b"  %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname],
                                             size,
                                             datestr(when),
                                             pathname))

    # List of filenames and sizes in descending order
    with open(os.path.join(reportdir, b"blob-shas-and-paths.txt"), 'bw') as f:
      f.write(("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")).encode())
      f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n").encode())
      for sha, size in sorted(stats['packed_size'].items(),
                              key=lambda x:(x[1],x[0]), reverse=True):
        if sha not in stats['names']:
          # Some objects in the repository might not be referenced, or not
          # referenced by the branches/tags the user cares about; skip them.
          continue
        names_with_sha = stats['names'][sha]
        if len(names_with_sha) == 1:
          names_with_sha = names_with_sha.pop()
        else:
          names_with_sha = b'[' + b', '.join(sorted(names_with_sha)) + b']'
        f.write(b"  %s %10d %10d %s\n" % (sha,
                                          stats['unpacked_size'][sha],
                                          size,
                                          names_with_sha))

  @staticmethod
  def run(args):
    """Entry point for --analyze: determine/create the report directory,
    gather data, and write the reports."""
    if args.report_dir:
      reportdir = args.report_dir
    else:
      git_dir = GitUtils.determine_git_dir(b'.')

      # Create the report directory as necessary
      results_tmp_dir = os.path.join(git_dir, b'filter-repo')
      if not os.path.isdir(results_tmp_dir):
        os.mkdir(results_tmp_dir)
      reportdir = os.path.join(results_tmp_dir, b"analysis")

    if os.path.isdir(reportdir):
      if args.force:
        sys.stdout.write(_("Warning: Removing recursively: \"%s\"\n") % decode(reportdir))
        shutil.rmtree(reportdir)
      else:
        sys.stdout.write(_("Error: dir already exists (use --force to delete): \"%s\"\n") % decode(reportdir))
        sys.exit(1)

    os.mkdir(reportdir)

    # Gather the data we need
    stats = RepoAnalyze.gather_data(args)

    # Write the reports
    sys.stdout.write(_("Writing reports to \"%s\"...") % decode(reportdir))
    sys.stdout.flush()
    RepoAnalyze.write_report(reportdir, stats)
    sys.stdout.write(_("done.\n"))
    sys.stdout.write(_("README: \"%s\"\n") % decode( os.path.join(reportdir, b"README") ))
2926
class FileInfoValueHelper:
  """Support object handed to file-info callbacks.

  Wraps a long-lived `git cat-file --batch-command` process so callbacks can
  look up blob contents and sizes by hash, insert new blobs, and apply the
  configured replace-text rules.
  """

  def __init__(self, replace_text, insert_blob_func, source_working_dir):
    # Free-form scratch dict that callbacks may use to carry state
    # between invocations.
    self.data = {}
    self._replace_text = replace_text
    self._insert_blob_func = insert_blob_func
    self._cat_file_process = subproc.Popen(
        ['git', 'cat-file', '--batch-command'],
        stdin = subprocess.PIPE,
        stdout = subprocess.PIPE,
        cwd = source_working_dir)

  def finalize(self):
    """Shut down the cat-file helper process."""
    proc = self._cat_file_process
    proc.stdin.close()
    proc.wait()

  def get_contents_by_identifier(self, blobhash):
    """Return the contents of the given blob, or None if it is missing."""
    proc = self._cat_file_process
    proc.stdin.write(b'contents ' + blobhash + b'\n')
    proc.stdin.flush()
    header = proc.stdout.readline()
    fields = header.split()
    if len(fields) != 3:
      assert(header == blobhash+b" missing\n")
      return None
    oid, oidtype, size = fields
    nbytes = int(size) # Convert e.g. b'6283' to 6283
    assert(oidtype == b'blob')
    # cat-file appends a newline after the contents; read it but drop it.
    return proc.stdout.read(nbytes + 1)[:-1]

  def get_size_by_identifier(self, blobhash):
    """Return the size in bytes of the given blob."""
    proc = self._cat_file_process
    proc.stdin.write(b'info ' + blobhash + b'\n')
    proc.stdin.flush()
    oid, oidtype, size = proc.stdout.readline().split()
    assert(oidtype == b'blob')
    return int(size) # Convert e.g. b'6283' to 6283

  def insert_file_with_contents(self, contents):
    """Create a new blob holding contents and return its fast-export id."""
    new_blob = Blob(contents)
    self._insert_blob_func(new_blob)
    return new_blob.id

  def is_binary(self, contents):
    """Treat contents as binary if a NUL byte appears in the first 8kB."""
    return contents.find(b"\0", 0, 8192) != -1

  def apply_replace_text(self, contents):
    """Apply all literal and then regex replace-text rules to contents."""
    updated = contents
    for literal, replacement in self._replace_text['literals']:
      updated = updated.replace(literal, replacement)
    for regex, replacement in self._replace_text['regexes']:
      updated = regex.sub(replacement, updated)
    return updated
2980
class LFSObjectTracker:
  """Tracks which git-lfs objects (by their lfs 'oid') are referenced in the
  source and target repositories, so filtering can report which lfs objects
  became orphaned."""

  class LFSObjs:
    """Per-repo record: fast-export id -> lfs oid, and the set of lfs oids."""
    def __init__(self):
      self.id_to_object_map = {}
      self.objects = set()

  def __init__(self, file_info, check_sources, check_targets):
    self.source_objects = LFSObjectTracker.LFSObjs()
    self.target_objects = LFSObjectTracker.LFSObjs()
    # Cache of git blob hash -> lfs oid, shared between source and target.
    self.hash_to_object_map = {}
    self.file_info = file_info
    self.check_sources = check_sources
    self.check_targets = check_targets
    self.objects_orphaned = False

  def _get_lfs_values(self, contents):
    """Parse contents as a git-lfs pointer file; return its key->value dict,
    or an empty dict if contents is not a valid pointer (too big, malformed
    line, or first key is not 'version')."""
    values = {}
    if len(contents) > 1024:
      return {}
    for line in contents.splitlines():
      try:
        (key, value) = line.split(b' ', 1)
      except ValueError:
        return {}
      if not values and key != b'version':
        return values
      values[key] = value
    return values

  def check_blob_data(self, contents, fast_export_id, source):
    """If contents is an lfs pointer, remember fast_export_id -> lfs oid."""
    if source and not self.check_sources:
      return
    mymap = self.source_objects if source else self.target_objects
    lfs_object_id = self._get_lfs_values(contents).get(b'oid')
    if lfs_object_id:
      mymap.id_to_object_map[fast_export_id] = lfs_object_id

  def check_file_change_data(self, git_id, source):
    """Record the lfs oid (if any) for a file change; git_id is either a
    fast-export mark (int) or a git blob hash (bytes)."""
    if source and not self.check_sources:
      return
    mymap = self.source_objects if source else self.target_objects
    if isinstance(git_id, int):
      lfs_object_id = mymap.id_to_object_map.get(git_id)
      if lfs_object_id:
        mymap.objects.add(lfs_object_id)
    else:
      if git_id in self.hash_to_object_map:
        mymap.objects.add(self.hash_to_object_map[git_id])
        return
      # lfs pointer files are small; anything >= 1024 bytes can't be one.
      size = self.file_info.get_size_by_identifier(git_id)
      if size >= 1024:
        return
      contents = self.file_info.get_contents_by_identifier(git_id)
      lfs_object_id = self._get_lfs_values(contents).get(b'oid')
      if lfs_object_id:
        self.hash_to_object_map[git_id] = lfs_object_id
        mymap.objects.add(lfs_object_id)

  def check_output_object(self, obj):
    """Inspect a filtered output object (Blob or Commit) for lfs pointers."""
    if not self.check_targets:
      return
    if type(obj) == Blob:
      self.check_blob_data(obj.data, obj.id, False)
    elif type(obj) == Commit:
      for change in obj.file_changes:
        # (Removed a stray sys.stdout.flush() debugging leftover here; it
        # flushed on every file change but nothing was being written.)
        if change.type != b'M' or change.mode == b'160000':
          continue
        self.check_file_change_data(change.blob_id, False)

  def find_all_lfs_objects_in_repo(self, repo, source):
    """Scan every object reachable from any ref in repo for lfs pointers."""
    if not source:
      self.file_info = FileInfoValueHelper(None, None, repo)
    # NOTE(review): stderr is piped but never read; if git writes a lot of
    # stderr this could stall -- confirm before changing.
    p = subproc.Popen(["git", "rev-list", "--objects", "--all"],
                      stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                      cwd=repo)
    for line in p.stdout.readlines():
      try:
        (git_oid, filename) = line.split()
      except ValueError:
        # Commit and tree objects only have oid
        continue

      mymap = self.source_objects if source else self.target_objects
      size = self.file_info.get_size_by_identifier(git_oid)
      if size >= 1024:
        continue
      contents = self.file_info.get_contents_by_identifier(git_oid)
      lfs_object_id = self._get_lfs_values(contents).get(b'oid')
      if lfs_object_id:
        mymap.objects.add(lfs_object_id)
    if not source:
      self.file_info.finalize()
3074
class InputFileBackup:
  """File-like reader that tees every byte read from input_file into
  output_file, preserving a copy of the consumed stream."""

  def __init__(self, input_file, output_file):
    self.input_file = input_file
    self.output_file = output_file

  def close(self):
    """Close both the underlying input and the backup output."""
    self.input_file.close()
    self.output_file.close()

  def read(self, size):
    """Read up to size bytes, mirroring them to the backup file."""
    chunk = self.input_file.read(size)
    self.output_file.write(chunk)
    return chunk

  def readline(self):
    """Read one line, mirroring it to the backup file."""
    data = self.input_file.readline()
    self.output_file.write(data)
    return data
3093
class DualFileWriter:
  """Writer that duplicates every write/flush/close onto two files."""

  def __init__(self, file1, file2):
    self.file1 = file1
    self.file2 = file2

  def write(self, *args):
    """Write the same data to both files."""
    for sink in (self.file1, self.file2):
      sink.write(*args)

  def flush(self):
    """Flush both files."""
    for sink in (self.file1, self.file2):
      sink.flush()

  def close(self):
    """Close both files."""
    for sink in (self.file1, self.file2):
      sink.close()
3110
3111class RepoFilter(object):
  def __init__(self,
               args,
               filename_callback = None,
               message_callback = None,
               name_callback = None,
               email_callback = None,
               refname_callback = None,
               blob_callback = None,
               commit_callback = None,
               tag_callback = None,
               reset_callback = None,
               done_callback = None,
               file_info_callback = None):
    """Create a RepoFilter from parsed args (a FilteringOptions namespace)
    plus optional callables invoked on each corresponding piece of the
    fast-export stream.  Passing a callback here AND the matching
    --*-callback argument is an error (checked in _handle_arg_callbacks)."""

    self._args = args

    # Repo we are exporting
    self._repo_working_dir = None

    # Store callbacks for acting on objects printed by FastExport
    self._blob_callback = blob_callback
    self._commit_callback = commit_callback
    self._tag_callback = tag_callback
    self._reset_callback = reset_callback
    self._done_callback = done_callback

    # Store callbacks for acting on slices of FastExport objects
    self._filename_callback = filename_callback  # filenames from commits
    self._message_callback = message_callback    # commit OR tag message
    self._name_callback = name_callback          # author, committer, tagger
    self._email_callback = email_callback        # author, committer, tagger
    self._refname_callback = refname_callback    # from commit/tag/reset
    self._file_info_callback = file_info_callback # various file info
    # Compile any --*-callback strings now; must run after the callback
    # attributes above are set since it reads and overwrites them.
    self._handle_arg_callbacks()

    # Helpers for callbacks
    self._file_info_value = None

    # Defaults for input
    self._input = None
    self._fep = None  # Fast Export Process
    self._fe_orig = None  # Path to where original fast-export output stored
    self._fe_filt = None  # Path to where filtered fast-export output stored
    self._parser = None  # FastExportParser object we are working with

    # Defaults for output
    self._output = None
    self._fip = None  # Fast Import Process
    self._import_pipes = None
    self._managed_output = True

    # A tuple of (depth, list-of-ancestors).  Commits and ancestors are
    # identified by their id (their 'mark' in fast-export or fast-import
    # speak).  The depth of a commit is one more than the max depth of any
    # of its ancestors.
    self._graph = AncestryGraph()
    # Another one, for ancestry of commits in the original repo
    self._orig_graph = AncestryGraph()

    # Names of files that were tweaked in any commit; such paths could lead
    # to subsequent commits being empty
    self._files_tweaked = set()

    # A set of commit hash pairs (oldhash, newhash) which used to be merge
    # commits but due to filtering were turned into non-merge commits.
    # The commits probably have suboptimal commit messages (e.g. "Merge branch
    # next into master").
    self._commits_no_longer_merges = []

    # A dict of original_ids to new_ids; filtering commits means getting
    # new commit hash (sha1sums), and we record the mapping both for
    # diagnostic purposes and so we can rewrite commit messages.  Note that
    # the new_id can be None rather than a commit hash if the original
    # commit became empty and was pruned or was otherwise dropped.
    self._commit_renames = {}

    # A set of original_ids (i.e. original hashes) for which we have not yet
    # gotten the new hashes; the value is always the corresponding fast-export
    # id (i.e. commit.id)
    self._pending_renames = collections.OrderedDict()

    # A dict of commit_hash[0:7] -> set(commit_hashes with that prefix).
    #
    # It's common for commit messages to refer to commits by abbreviated
    # commit hashes, as short as 7 characters.  To facilitate translating
    # such short hashes, we have a mapping of prefixes to full old hashes.
    self._commit_short_old_hashes = collections.defaultdict(set)

    # A set of commit hash references appearing in commit messages which
    # mapped to a valid commit that was removed entirely in the filtering
    # process.  The commit message will continue to reference the
    # now-missing commit hash, since there was nothing to map it to.
    self._commits_referenced_but_removed = set()

    # Other vars related to metadata tracking
    self._already_ran = False
    self._changed_refs = set()
    self._lfs_object_tracker = None

    # Progress handling (number of commits parsed, etc.)
    self._progress_writer = ProgressWriter()
    self._num_commits = 0

    # Size of blobs in the repo
    self._unpacked_size = {}

    # Other vars
    self._sanity_checks_handled = False
    self._finalize_handled = False
    self._orig_refs = None
    self._config_settings = {}
    self._newnames = {}
    self._stash = None

    # Cache a few message translations for performance reasons
    self._parsed_message = _("Parsed %d commits")

    # Compile some regexes and cache those
    # Matches anything that looks like an abbreviated-or-full commit hash.
    self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)')
3231
  def _handle_arg_callbacks(self):
    """Turn each --*-callback command-line string (or filename containing
    code) into a real function and store it on the matching
    self._*_callback attribute."""
    def make_callback(args, bdy):
      # Compile the user-supplied body into `def callback(<args>):` with the
      # module's public globals visible; a single-string args means a
      # one-parameter callback (the dummy second param keeps args a tuple).
      callback_globals = {g: globals()[g] for g in public_globals}
      callback_locals = {}
      if type(args) == str:
        args = (args, '_do_not_use_this_var = None')
      exec('def callback({}):\n'.format(', '.join(args))+
           '  '+'\n  '.join(bdy.splitlines()), callback_globals, callback_locals)
      return callback_locals['callback']
    def handle(which, args=None):
      # `which` is the user-facing option name (may contain '-'); the
      # attribute name uses underscores.
      which_under = which.replace('-','_')
      if not args:
        args = which
      callback_field = '_{}_callback'.format(which_under)
      code_string = getattr(self._args, which_under+'_callback')
      if code_string:
        # If the argument names an existing file, its contents are the code.
        if os.path.exists(code_string):
          with open(code_string, 'r', encoding='utf-8') as f:
            code_string = f.read()
        if getattr(self, callback_field):
          raise SystemExit(_("Error: Cannot pass a %s_callback to RepoFilter "
                             "AND pass --%s-callback"
                           % (which_under, which)))
        # blob/commit/tag/reset callbacks mutate their argument in place, so
        # only the other callbacks are required to return a value.
        if 'return ' not in code_string and \
           which not in ('blob', 'commit', 'tag', 'reset'):
          raise SystemExit(_("Error: --%s-callback should have a return statement")
                           % which)
        setattr(self, callback_field, make_callback(args, code_string))
    handle('filename')
    handle('message')
    handle('name')
    handle('email')
    handle('refname')
    handle('blob')
    handle('commit')
    handle('tag')
    handle('reset')
    handle('file-info', ('filename', 'mode', 'blob_id', 'value'))
3270
3271 def _run_sanity_checks(self):
3272 self._sanity_checks_handled = True
3273 if not self._managed_output:
3274 if not self._args.replace_refs:
3275 # If not _managed_output we don't want to make extra changes to the
3276 # repo, so set default to no-op 'update-no-add'
3277 self._args.replace_refs = 'update-no-add'
3278 return
3279
3280 if self._args.debug:
3281 print("[DEBUG] Passed arguments:\n{}".format(self._args))
3282
3283 # Determine basic repository information
3284 target_working_dir = self._args.target or b'.'
3285 self._orig_refs = GitUtils.get_refs(target_working_dir)
3286 is_bare = GitUtils.is_repository_bare(target_working_dir)
3287 self._config_settings = GitUtils.get_config_settings(target_working_dir)
3288
3289 # Determine if this is second or later run of filter-repo
3290 tmp_dir = self.results_tmp_dir(create_if_missing=False)
3291 ran_path = os.path.join(tmp_dir, b'already_ran')
3292 self._already_ran = os.path.isfile(ran_path)
3293 if self._already_ran:
3294 current_time = time.time()
3295 file_mod_time = os.path.getmtime(ran_path)
3296 file_age = current_time - file_mod_time
3297 if file_age > 86400: # file older than a day
3298 msg = (f"The previous run is older than a day ({decode(ran_path)} already exists).\n"
3299 f"See \"Already Ran\" section in the manual for more information.\n"
3300 f"Treat this run as a continuation of filtering in the previous run (Y/N)? ")
3301 response = input(msg)
3302
3303 if response.lower() != 'y':
3304 os.remove(ran_path)
3305 self._already_ran = False
3306
3307 # Interaction between --already-ran and --sensitive_data_removal
3308 msg = textwrap.dedent(_("""\
3309 Error: Cannot specify --sensitive-data-removal on a follow-up invocation
3310 of git-filter-repo unless it was specified in previously runs."""))
3311 if self._already_ran:
3312 sdr_path = os.path.join(tmp_dir, b'sensitive_data_removal')
3313 sdr_previously = os.path.isfile(sdr_path)
3314 if not sdr_previously and self._args.sensitive_data_removal:
3315 raise SystemExit(msg)
3316 # Treat this as a --sensitive-data-removal run if a previous run was,
3317 # even if it wasn't specified this time
3318 self._args.sensitive_data_removal = sdr_previously
3319
3320 # Have to check sensitive_data_removal interactions here instead of
3321 # sanity_check_args because of the above interaction with already_ran stuff
3322 if self._args.sensitive_data_removal:
3323 if self._args.stdin:
3324 msg = _("Error: sensitive data removal is incompatible with --stdin")
3325 raise SystemExit(msg)
3326 if self._args.source or self._args.target:
3327 msg = _("Error: sensitive data removal is incompatible with --source and --target")
3328 raise SystemExit(msg)
3329
3330 # Default for --replace-refs
3331 if not self._args.replace_refs:
3332 self._args.replace_refs = 'delete-no-add'
3333 if self._args.replace_refs == 'old-default':
3334 self._args.replace_refs = ('update-or-add' if self._already_ran
3335 else 'update-and-add')
3336
3337 # Do sanity checks from the correct directory
3338 if not self._args.force and not self._already_ran:
3339 cwd = os.getcwd()
3340 os.chdir(target_working_dir)
3341 RepoFilter.sanity_check(self._orig_refs, is_bare, self._config_settings)
3342 os.chdir(cwd)
3343
  def _setup_lfs_orphaning_checks(self):
    '''Set up LFS-object orphaning checks for --sensitive-data-removal runs.

    Bails out early (leaving self._lfs_object_tracker as None) when the
    checks are not applicable: sensitive-data-removal is off, a follow-up
    run did no LFS filtering previously, or a first run has no active
    "filter=lfs" entry in HEAD's .gitattributes.  Note that
    self._file_info_value may be created even if we ultimately bail out.
    Otherwise, creates self._lfs_object_tracker and seeds it with the
    initial set of LFS objects.'''
    # Do a couple checks to see if we want to do lfs orphaning checks
    if not self._args.sensitive_data_removal:
      return
    metadata_dir = self.results_tmp_dir()
    lfs_objects_file = os.path.join(metadata_dir, b'original_lfs_objects')
    if self._already_ran:
      # Check if we did lfs filtering in the previous run
      if not os.path.isfile(lfs_objects_file):
        return

    # Set up self._file_info_value so we can query git for stuff
    source_working_dir = self._args.source or b'.'
    self._file_info_value = FileInfoValueHelper(self._args.replace_text,
                                                self.insert,
                                                source_working_dir)

    # One more check to see if we want to do lfs orphaning checks
    if not self._already_ran:
      # Check if lfs filtering is active in HEAD's .gitattributes file
      a = self._file_info_value.get_contents_by_identifier(b"HEAD:.gitattributes")
      if not a or not re.search(rb'\bfilter=lfs\b', a):
        return

    # Set up the object tracker.  Source objects can only be enumerated on
    # a full first run; target objects only on a full (non --partial) run.
    check_sources = not self._already_ran and not self._args.partial
    check_targets = not self._args.partial
    self._lfs_object_tracker = LFSObjectTracker(self._file_info_value,
                                                check_sources,
                                                check_targets)
    self._parser._lfs_object_tracker = self._lfs_object_tracker # kinda gross

    # Get initial objects: on a follow-up run, reload the LFS objects
    # recorded by the previous run; on a --partial first run, scan the repo.
    if self._already_ran:
      with open(lfs_objects_file, 'br') as f:
        for line in f:
          self._lfs_object_tracker.source_objects.objects.add(line.strip())
    elif self._args.partial:
      source = True
      self._lfs_object_tracker.find_all_lfs_objects_in_repo(source_working_dir,
                                                            source)
3385
3386 @staticmethod
3387 def loose_objects_are_replace_refs(git_dir, refs, num_loose_objects):
3388 replace_objects = set()
3389 for refname, rev in refs.items():
3390 if not refname.startswith(b'refs/replace/'):
3391 continue
3392 replace_objects.add(rev)
3393
3394 validobj_re = re.compile(rb'^[0-9a-f]{40}$')
3395 object_dir=os.path.join(git_dir, b'objects')
3396 for root, dirs, files in os.walk(object_dir):
3397 for filename in files:
3398 objname = os.path.basename(root)+filename
3399 if objname not in replace_objects and validobj_re.match(objname):
3400 return False
3401
3402 return True
3403
  @staticmethod
  def sanity_check(refs, is_bare, config_settings):
    '''Abort (via SystemExit) unless the current directory looks like a
    fresh, fully-packed clone: correct GIT_DIR layout, no ref collisions
    on case-insensitive or unicode-normalizing filesystems, fully packed
    objects, a single "origin" remote (or a brand new bare repo), reflogs
    with at most one entry, no stash, and -- for non-bare repos -- no
    uncommitted/unstaged/untracked or unpushed changes and only one
    worktree.  Callers can bypass these checks with --force.'''
    def abort(reason):
      # Local clones made without --no-local share objects with the source
      # repo; give a hint when remote.origin.url is a local directory.
      dirname = config_settings.get(b'remote.origin.url', b'')
      msg = ""
      if dirname and os.path.isdir(dirname):
        msg = _("Note: when cloning local repositories, you need to pass\n"
                " --no-local to git clone to avoid this issue.\n")
      raise SystemExit(
        _("Aborting: Refusing to destructively overwrite repo history since\n"
          "this does not look like a fresh clone.\n"
          " (%s)\n%s"
          "Please operate on a fresh clone instead. If you want to proceed\n"
          "anyway, use --force.") % (reason, msg))

    # Avoid letting people running with weird setups and overwriting GIT_DIR
    # elsewhere
    git_dir = GitUtils.determine_git_dir(b'.')
    if is_bare and git_dir != b'.':
      abort(_("GIT_DIR must be ."))
    elif not is_bare and git_dir != b'.git':
      abort(_("GIT_DIR must be .git"))

    # Check for refname collisions
    if config_settings.get(b'core.ignorecase', b'false') == b'true':
      collisions = collections.defaultdict(list)
      for ref in refs:
        collisions[ref.lower()].append(ref)
      msg = ""
      for ref in collisions:
        if len(collisions[ref]) >= 2:
          msg += " " + decode(b", ".join(collisions[ref])) + "\n"
      if msg:
        raise SystemExit(
          _("Aborting: Cannot rewrite history on a case insensitive\n"
            "filesystem since you have refs that differ in case only:\n"
            "%s") % msg)
    if config_settings.get(b'core.precomposeunicode', b'false') == b'true':
      import unicodedata # Mac users need to have python-3.8
      collisions = collections.defaultdict(list)
      for ref in refs:
        strref = decode(ref)
        collisions[unicodedata.normalize('NFC', strref)].append(strref)
      msg = ""
      for ref in collisions:
        if len(collisions[ref]) >= 2:
          msg += " " + ", ".join(collisions[ref]) + "\n"
      if msg:
        raise SystemExit(
          _("Aborting: Cannot rewrite history on a character normalizing\n"
            "filesystem since you have refs that differ in normalization:\n"
            "%s") % msg)

    # Make sure repo is fully packed, just like a fresh clone would be.
    # Note that transfer.unpackLimit defaults to 100, meaning that a
    # repository with no packs and less than 100 objects should be considered
    # fully packed.
    output = subproc.check_output('git count-objects -v'.split())
    stats = dict(x.split(b': ') for x in output.splitlines())
    num_packs = int(stats[b'packs'])
    num_loose_objects = int(stats[b'count'])
    if num_packs > 1 or \
       num_loose_objects >= 100 or \
       (num_packs == 1 and num_loose_objects > 0 and
        not RepoFilter.loose_objects_are_replace_refs(git_dir, refs,
                                                      num_loose_objects)):
      abort(_("expected freshly packed repo"))

    # Make sure there is precisely one remote, named "origin"...or that this
    # is a new bare repo with no packs and no remotes
    output = subproc.check_output('git remote'.split()).strip()
    if not (output == b"origin" or (num_packs == 0 and not output)):
      abort(_("expected one remote, origin"))

    # Make sure that all reflogs have precisely one entry
    reflog_dir=os.path.join(git_dir, b'logs')
    for root, dirs, files in os.walk(reflog_dir):
      for filename in files:
        pathname = os.path.join(root, filename)
        with open(pathname, 'br') as f:
          if len(f.read().splitlines()) > 1:
            shortpath = pathname[len(reflog_dir)+1:]
            abort(_("expected at most one entry in the reflog for %s") %
                  decode(shortpath))

    # Make sure there are no stashed changes
    if b'refs/stash' in refs:
      abort(_("has stashed changes"))

    # Do extra checks in non-bare repos
    if not is_bare:
      # Avoid uncommitted, unstaged, or untracked changes
      if subproc.call('git diff --staged --quiet'.split()):
        abort(_("you have uncommitted changes"))
      if subproc.call('git diff --quiet'.split()):
        abort(_("you have unstaged changes"))
      untracked_output = subproc.check_output('git ls-files -o'.split())
      if len(untracked_output) > 0:
        uf = untracked_output.rstrip(b'\n').split(b'\n')
        # Since running git-filter-repo can result in files being written to
        # __pycache__ (depending on python version, env vars, etc.), let's
        # ignore those as far as "clean clone" is concerned.
        relevant_uf = [x for x in uf
                       if not x.startswith(b'__pycache__/git_filter_repo.')]
        if len(relevant_uf) > 0:
          abort(_("you have untracked changes"))

      # Avoid unpushed changes
      for refname, rev in refs.items():
        if not refname.startswith(b'refs/heads/'):
          continue
        origin_ref = refname.replace(b'refs/heads/', b'refs/remotes/origin/')
        if origin_ref not in refs:
          abort(_('%s exists, but %s not found') % (decode(refname),
                                                    decode(origin_ref)))
        if rev != refs[origin_ref]:
          abort(_('%s does not match %s') % (decode(refname),
                                             decode(origin_ref)))

      # Make sure there is only one worktree
      output = subproc.check_output('git worktree list'.split())
      if len(output.splitlines()) > 1:
        abort(_('you have multiple worktrees'))
3527
3528 def cleanup(self, repo, repack, reset,
3529 run_quietly=False, show_debuginfo=False):
3530 ''' cleanup repo; if repack then expire reflogs and do a gc --prune=now.
3531 if reset then do a reset --hard. Optionally also curb output if
3532 run_quietly is True, or go the opposite direction and show extra
3533 output if show_debuginfo is True. '''
3534 assert not (run_quietly and show_debuginfo)
3535
3536 if (repack and not run_quietly and not show_debuginfo):
3537 print(_("Repacking your repo and cleaning out old unneeded objects"))
3538 quiet_flags = '--quiet' if run_quietly else ''
3539 cleanup_cmds = []
3540 if repack:
3541 cleanup_cmds = ['git reflog expire --expire=now --all'.split(),
3542 'git gc {} --prune=now'.format(quiet_flags).split()]
3543 if reset:
3544 cleanup_cmds.insert(0, 'git reset {} --hard'.format(quiet_flags).split())
3545 location_info = ' (in {})'.format(decode(repo)) if repo != b'.' else ''
3546 for cmd in cleanup_cmds:
3547 if show_debuginfo:
3548 print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd)))
3549 ret = subproc.call(cmd, cwd=repo)
3550 if ret != 0:
3551 raise SystemExit("fatal: running '%s' failed!" % ' '.join(cmd))
3552 if cmd[0:3] == 'git reflog expire'.split():
3553 self._write_stash()
3554
3555 def _get_rename(self, old_hash):
3556 # If we already know the rename, just return it
3557 new_hash = self._commit_renames.get(old_hash, None)
3558 if new_hash:
3559 return new_hash
3560
3561 # If it's not in the remaining pending renames, we don't know it
3562 if old_hash is not None and old_hash not in self._pending_renames:
3563 return None
3564
3565 # Read through the pending renames until we find it or we've read them all,
3566 # and return whatever we might find
3567 self._flush_renames(old_hash)
3568 return self._commit_renames.get(old_hash, None)
3569
3570 def _flush_renames(self, old_hash=None, limit=0):
3571 # Parse through self._pending_renames until we have read enough. We have
3572 # read enough if:
3573 # self._pending_renames is empty
3574 # old_hash != None and we found a rename for old_hash
3575 # limit > 0 and len(self._pending_renames) started less than 2*limit
3576 # limit > 0 and len(self._pending_renames) < limit
3577 if limit and len(self._pending_renames) < 2 * limit:
3578 return
3579 fi_input, fi_output = self._import_pipes
3580 while self._pending_renames:
3581 orig_hash, new_fast_export_id = self._pending_renames.popitem(last=False)
3582 new_hash = fi_output.readline().rstrip()
3583 self._commit_renames[orig_hash] = new_hash
3584 self._graph.record_hash(new_fast_export_id, new_hash)
3585 if old_hash == orig_hash:
3586 return
3587 if limit and len(self._pending_renames) < limit:
3588 return
3589
3590 def _translate_commit_hash(self, matchobj_or_oldhash):
3591 old_hash = matchobj_or_oldhash
3592 if not isinstance(matchobj_or_oldhash, bytes):
3593 old_hash = matchobj_or_oldhash.group(1)
3594 orig_len = len(old_hash)
3595 new_hash = self._get_rename(old_hash)
3596 if new_hash is None:
3597 if old_hash[0:7] not in self._commit_short_old_hashes:
3598 self._commits_referenced_but_removed.add(old_hash)
3599 return old_hash
3600 possibilities = self._commit_short_old_hashes[old_hash[0:7]]
3601 matches = [x for x in possibilities
3602 if x[0:orig_len] == old_hash]
3603 if len(matches) != 1:
3604 self._commits_referenced_but_removed.add(old_hash)
3605 return old_hash
3606 old_hash = matches[0]
3607 new_hash = self._get_rename(old_hash)
3608
3609 assert new_hash is not None
3610 return new_hash[0:orig_len]
3611
  def _maybe_trim_extra_parents(self, orig_parents, parents):
    '''Due to pruning of empty commits, some parents could be non-existent
    (None) or otherwise redundant.  Remove the non-existent parents, and
    remove redundant parents ***SO LONG AS*** that doesn't transform a
    merge commit into a non-merge commit.

    orig_parents are the parents before any rewriting; parents are the
    same parents after remapping (possibly to an ancestor, or to None if
    the whole ancestry line was pruned).

    Returns a tuple:
      (parents, new_first_parent_if_would_become_non_merge)
    where the second element is non-None only when removing redundant
    parents would have collapsed a merge into a non-merge; in that case
    the original parent list is returned unchanged alongside it.'''

    always_prune = (self._args.prune_degenerate == 'always')

    # Pruning of empty commits means multiple things:
    #   * An original parent of this commit may have been pruned causing the
    #     need to rewrite the reported parent to the nearest ancestor.  We
    #     want to know when we're dealing with such a parent.
    #   * Further, there may be no "nearest ancestor" if the entire history
    #     of that parent was also pruned.  (Detectable by the parent being
    #     'None')
    # Remove all parents rewritten to None, and keep track of which parents
    # were rewritten to an ancestor.
    tmp = zip(parents,
              orig_parents,
              [(x in _SKIPPED_COMMITS or always_prune) for x in orig_parents])
    tmp2 = [x for x in tmp if x[0] is not None]
    if not tmp2:
      # All ancestors have been pruned; we have no parents.
      return [], None
    parents, orig_parents, is_rewritten = [list(x) for x in zip(*tmp2)]

    # We can't have redundant parents if we don't have at least 2 parents
    if len(parents) < 2:
      return parents, None

    # Don't remove redundant parents if user doesn't want us to
    if self._args.prune_degenerate == 'never':
      return parents, None

    # Remove duplicate parents (if both sides of history have lots of commits
    # which become empty due to pruning, the most recent ancestor on both
    # sides may be the same commit), except only remove parents that have
    # been rewritten due to previous empty pruning.
    seen = set()
    seen_add = seen.add
    # Deleting duplicate rewritten parents means keeping parents if either
    # they have not been seen or they are ones that have not been rewritten.
    # NOTE: parents_copy is an alias, not a copy -- but 'parents' is rebound
    # to a new list just below, so parents_copy preserves the pre-dedup list
    # (and is NOT affected by the parents.pop() calls further down).
    parents_copy = parents
    uniq = [[p, orig_parents[i], is_rewritten[i]] for i, p in enumerate(parents)
            if not (p in seen or seen_add(p)) or not is_rewritten[i]]
    parents, orig_parents, is_rewritten = [list(x) for x in zip(*uniq)]
    if len(parents) < 2:
      # Dedup would turn this merge into a non-merge; report it instead
      return parents_copy, parents[0]

    # Flatten unnecessary merges. (If one side of history is entirely
    # empty commits that were pruned, we may end up attempting to
    # merge a commit with its ancestor. Remove parents that are an
    # ancestor of another parent.)
    num_parents = len(parents)
    to_remove = []
    for cur in range(num_parents):
      if not is_rewritten[cur]:
        continue
      for other in range(num_parents):
        if cur == other:
          continue
        if not self._graph.is_ancestor(parents[cur], parents[other]):
          continue
        # parents[cur] is an ancestor of parents[other], so parents[cur]
        # seems redundant. However, if it was intentionally redundant
        # (e.g. a no-ff merge) in the original, then we want to keep it.
        if not always_prune and \
           self._orig_graph.is_ancestor(orig_parents[cur],
                                        orig_parents[other]):
          continue
        # Some folks want their history to have all first parents be merge
        # commits (except for any root commits), and always do a merge --no-ff.
        # For such folks, don't remove the first parent even if it's an
        # ancestor of other commits.
        if self._args.no_ff and cur == 0:
          continue
        # Okay so the cur-th parent is an ancestor of the other-th parent,
        # and it wasn't that way in the original repository; mark the
        # cur-th parent as removable.
        to_remove.append(cur)
        break # cur removed, so skip rest of others -- i.e. check cur+=1
    # Pop from the end so earlier indices stay valid
    for x in reversed(to_remove):
      parents.pop(x)
    if len(parents) < 2:
      # Flattening would turn this merge into a non-merge; report it instead
      return parents_copy, parents[0]

    return parents, None
3702
  def _prunable(self, commit, new_1st_parent, had_file_changes, orig_parents):
    '''Return True if the (already filtered) commit should be pruned as
    empty or degenerate.

    new_1st_parent is the would-be first parent when pruning redundant
    parents would collapse this merge into a non-merge (see
    _maybe_trim_extra_parents); had_file_changes says whether the commit
    had file changes before filtering; orig_parents are the parents prior
    to rewriting.  For the hard cases, queries the fast-import process to
    compare this commit's file contents against its remaining parent.'''
    parents = commit.parents

    if self._args.prune_empty == 'never':
      return False
    always_prune = (self._args.prune_empty == 'always')

    # For merge commits, unless there are prunable (redundant) parents, we
    # do not want to prune
    if len(parents) >= 2 and not new_1st_parent:
      return False

    if len(parents) < 2:
      # Special logic for commits that started empty...
      if not had_file_changes and not always_prune:
        had_parents_pruned = (len(parents) < len(orig_parents) or
                              (len(orig_parents) == 1 and
                               orig_parents[0] in _SKIPPED_COMMITS))
        # If the commit remains empty and had parents which were pruned,
        # then prune this commit; otherwise, retain it
        return (not commit.file_changes and had_parents_pruned)

      # We can only get here if the commit didn't start empty, so if it's
      # empty now, it obviously became empty
      if not commit.file_changes:
        return True

    # If there are no parents of this commit and we didn't match the case
    # above, then this commit cannot be pruned. Since we have no parent(s)
    # to compare to, abort now to prevent future checks from failing.
    if not parents:
      return False

    # Similarly, we cannot handle the hard cases if we don't have a pipe
    # to communicate with fast-import
    if not self._import_pipes:
      return False

    # If there have not been renames/remappings of IDs (due to insertion of
    # new blobs), then we can sometimes know things aren't prunable with a
    # simple check
    if not _IDS.has_renames():
      # non-merge commits can only be empty if blob/file-change editing caused
      # all file changes in the commit to have the same file contents as
      # the parent.
      changed_files = set(change.filename for change in commit.file_changes)
      if len(orig_parents) < 2 and changed_files - self._files_tweaked:
        return False

    # Finally, the hard case: due to either blob rewriting, or due to pruning
    # of empty commits wiping out the first parent history back to the merge
    # base, the list of file_changes we have may not actually differ from our
    # (new) first parent's version of the files, i.e. this would actually be
    # an empty commit. Check by comparing the contents of this commit to its
    # (remaining) parent.
    #
    # NOTE on why this works, for the case of original first parent history
    # having been pruned away due to being empty:
    #   The first parent history having been pruned away due to being
    #   empty implies the original first parent would have a tree (after
    #   filtering) that matched the merge base's tree. Since
    #   file_changes has the changes needed to go from what would have
    #   been the first parent to our new commit, and what would have been
    #   our first parent has a tree that matches the merge base, then if
    #   the new first parent has a tree matching the versions of files in
    #   file_changes, then this new commit is empty and thus prunable.
    fi_input, fi_output = self._import_pipes
    self._flush_renames() # Avoid fi_output having other stuff present
    # Optimization note: we could have two loops over file_changes, the
    # first doing all the self._output.write() calls, and the second doing
    # the rest. But I'm worried about fast-import blocking on fi_output
    # buffers filling up so I instead read from it as I go.
    for change in commit.file_changes:
      parent = new_1st_parent or commit.parents[0] # exists due to above checks
      quoted_filename = PathQuoting.enquote(change.filename)
      # 'ls' asks fast-import what the parent's version of this path is;
      # integer parents are fast-import marks, others are real hashes
      if isinstance(parent, int):
        self._output.write(b"ls :%d %s\n" % (parent, quoted_filename))
      else:
        self._output.write(b"ls %s %s\n" % (parent, quoted_filename))
      self._output.flush()
      parent_version = fi_output.readline().split()
      if change.type == b'D':
        # A deletion only keeps the commit empty if the parent also lacks it
        if parent_version != [b'missing', quoted_filename]:
          return False
      else:
        blob_sha = change.blob_id
        if isinstance(change.blob_id, int):
          # blob_id is a mark; resolve it to a real sha via get-mark
          self._output.write(b"get-mark :%d\n" % change.blob_id)
          self._output.flush()
          blob_sha = fi_output.readline().rstrip()
        if parent_version != [change.mode, b'blob', blob_sha, quoted_filename]:
          return False

    return True
3797
3798 def _record_remapping(self, commit, orig_parents):
3799 new_id = None
3800 # Record the mapping of old commit hash to new one
3801 if commit.original_id and self._import_pipes:
3802 fi_input, fi_output = self._import_pipes
3803 self._output.write(b"get-mark :%d\n" % commit.id)
3804 self._output.flush()
3805 orig_id = commit.original_id
3806 self._commit_short_old_hashes[orig_id[0:7]].add(orig_id)
3807 # Note that we have queued up an id for later reading; flush a
3808 # few of the older ones if we have too many queued up
3809 self._pending_renames[orig_id] = commit.id
3810 self._flush_renames(None, limit=40)
3811 # Also, record if this was a merge commit that turned into a non-merge
3812 # commit.
3813 if len(orig_parents) >= 2 and len(commit.parents) < 2:
3814 self._commits_no_longer_merges.append((commit.original_id, new_id))
3815
3816 def callback_metadata(self, extra_items = dict()):
3817 return {'commit_rename_func': self._translate_commit_hash,
3818 'ancestry_graph': self._graph,
3819 'original_ancestry_graph': self._orig_graph,
3820 **extra_items}
3821
3822 def _tweak_blob(self, blob):
3823 if self._args.max_blob_size and len(blob.data) > self._args.max_blob_size:
3824 blob.skip()
3825
3826 if blob.original_id in self._args.strip_blobs_with_ids:
3827 blob.skip()
3828
3829 if ( self._args.replace_text
3830 and not self._file_info_callback
3831 # not (if blob contains zero byte in the first 8Kb, that is, if blob is binary data)
3832 and not b"\0" in blob.data[0:8192]
3833 ):
3834 for literal, replacement in self._args.replace_text['literals']:
3835 blob.data = blob.data.replace(literal, replacement)
3836 for regex, replacement in self._args.replace_text['regexes']:
3837 blob.data = regex.sub(replacement, blob.data)
3838
3839 if self._blob_callback:
3840 self._blob_callback(blob, self.callback_metadata())
3841
3842 self._insert_into_stream(blob)
3843
  def _filter_files(self, commit):
    '''Apply path filtering and renaming (--path, --path-glob,
    --path-regex, --path-rename, --use-base-name, --invert-paths, and the
    filename callback) to commit.file_changes: drop excluded files,
    resolve rename-induced collisions, strip overlarge or explicitly
    stripped blobs, and leave commit.file_changes sorted by filename.
    Raises SystemExit on an unresolvable pathname collision.'''
    def filename_matches(path_expression, pathname):
      ''' Returns whether path_expression matches pathname or a leading
          directory thereof, allowing path_expression to not have a trailing
          slash even if it is meant to match a leading directory. '''
      if path_expression == b'':
        return True
      n = len(path_expression)
      if (pathname.startswith(path_expression) and
          (path_expression[n-1:n] == b'/' or
           len(pathname) == n or
           pathname[n:n+1] == b'/')):
        return True
      return False

    def newname(path_changes, pathname, use_base_name, filtering_is_inclusive):
      ''' Applies filtering and rename changes from path_changes to pathname,
          returning any of None (file isn't wanted), original filename (file
          is wanted with original name), or new filename. '''
      wanted = False
      full_pathname = pathname
      if use_base_name:
        pathname = os.path.basename(pathname)
      for (mod_type, match_type, path_exp) in path_changes:
        if mod_type == 'filter' and not wanted:
          assert match_type in ('match', 'glob', 'regex')
          if match_type == 'match' and filename_matches(path_exp, pathname):
            wanted = True
          if match_type == 'glob' and fnmatch.fnmatch(pathname, path_exp):
            wanted = True
          if match_type == 'regex' and path_exp.search(pathname):
            wanted = True
        elif mod_type == 'rename':
          match, repl = path_exp
          assert match_type in ('match','regex') # glob was translated to regex
          if match_type == 'match' and filename_matches(match, full_pathname):
            full_pathname = full_pathname.replace(match, repl, 1)
            pathname = full_pathname # rename incompatible with use_base_name
          if match_type == 'regex':
            full_pathname = match.sub(repl, full_pathname)
            pathname = full_pathname # rename incompatible with use_base_name
      return full_pathname if (wanted == filtering_is_inclusive) else None

    args = self._args
    new_file_changes = {} # Assumes no renames or copies, otherwise collisions
    for change in commit.file_changes:
      # NEEDSWORK: _If_ we ever want to pass `--full-tree` to fast-export and
      # parse that output, we'll need to modify this block; `--full-tree`
      # issues a deleteall directive which has no filename, and thus this
      # block would normally strip it. Of course, FileChange() and
      # _parse_optional_filechange() would need updates too.
      if change.type == b'DELETEALL':
        new_file_changes[b''] = change
        continue
      # self._newnames caches old-name -> new-name decisions so the same
      # path is only run through newname()/the callback once per repo
      if change.filename in self._newnames:
        change.filename = self._newnames[change.filename]
      else:
        original_filename = change.filename
        change.filename = newname(args.path_changes, change.filename,
                                  args.use_base_name, args.inclusive)
        if self._filename_callback:
          change.filename = self._filename_callback(change.filename)
        self._newnames[original_filename] = change.filename
      if not change.filename:
        continue # Filtering criteria excluded this file; move on to next one
      if change.filename in new_file_changes:
        # Getting here means that path renaming is in effect, and caused one
        # path to collide with another. That's usually bad, but can be okay
        # under two circumstances:
        #   1) Sometimes people have a file named OLDFILE in old revisions of
        #      history, and they rename to NEWFILE, and would like to rewrite
        #      history so that all revisions refer to it as NEWFILE. As such,
        #      we can allow a collision when (at least) one of the two paths
        #      is a deletion. Note that if OLDFILE and NEWFILE are unrelated
        #      this also allows the rewrite to continue, which makes sense
        #      since OLDFILE is no longer in the way.
        #   2) If OLDFILE and NEWFILE are exactly equal, then writing them
        #      both to the same location poses no problem; we only need one
        #      file. (This could come up if someone copied a file in some
        #      commit, then later either deleted the file or kept it exactly
        #      in sync with the original with any changes, and then decides
        #      they want to rewrite history to only have one of the two files)
        colliding_change = new_file_changes[change.filename]
        if change.type == b'D':
          # We can just throw this one away and keep the other
          continue
        elif change.type == b'M' and (
            change.mode == colliding_change.mode and
            change.blob_id == colliding_change.blob_id):
          # The two are identical, so we can throw this one away and keep other
          continue
        elif new_file_changes[change.filename].type != b'D':
          raise SystemExit(_("File renaming caused colliding pathnames!\n") +
                           _(" Commit: {}\n").format(commit.original_id) +
                           _(" Filename: {}").format(change.filename))
      # Strip files that are too large
      if self._args.max_blob_size and \
         self._unpacked_size.get(change.blob_id, 0) > self._args.max_blob_size:
        continue
      if self._args.strip_blobs_with_ids and \
         change.blob_id in self._args.strip_blobs_with_ids:
        continue
      # Otherwise, record the change
      new_file_changes[change.filename] = change
    # Sorted by filename; DELETEALL's b'' key naturally sorts first
    commit.file_changes = [v for k,v in sorted(new_file_changes.items())]
3949
  def _tweak_commit(self, commit, aux_info):
    """Apply all filters and callbacks to a commit, then stream or prune it.

    In order: rewrites the commit message (--replace-message, message
    callback, commit-hash translation), the author/committer (--mailmap,
    name/email callbacks), and the target branch (--tag-rename, refname
    callback); filters the file changes; records the commit in both the
    old and new ancestry graphs; trims parents made redundant by pruning;
    re-expresses file changes relative to a new first parent when needed;
    runs the --file-info-callback and the general commit callback; and
    finally either inserts the commit into the output stream or skips it
    as prunable (recording the replacement id and any needed Reset).
    """
    if self._args.replace_message:
      for literal, replacement in self._args.replace_message['literals']:
        commit.message = commit.message.replace(literal, replacement)
      for regex, replacement in self._args.replace_message['regexes']:
        commit.message = regex.sub(replacement, commit.message)
    if self._message_callback:
      commit.message = self._message_callback(commit.message)

    # Change the commit message according to callback
    if not self._args.preserve_commit_hashes:
      commit.message = self._hash_re.sub(self._translate_commit_hash,
                                         commit.message)

    # Change the author & committer according to mailmap rules
    args = self._args
    if args.mailmap:
      commit.author_name, commit.author_email = \
          args.mailmap.translate(commit.author_name, commit.author_email)
      commit.committer_name, commit.committer_email = \
          args.mailmap.translate(commit.committer_name, commit.committer_email)
    # Change author & committer according to callbacks
    if self._name_callback:
      commit.author_name = self._name_callback(commit.author_name)
      commit.committer_name = self._name_callback(commit.committer_name)
    if self._email_callback:
      commit.author_email = self._email_callback(commit.author_email)
      commit.committer_email = self._email_callback(commit.committer_email)

    # Sometimes the 'branch' given is a tag; if so, rename it as requested so
    # we don't get any old tagnames
    if self._args.tag_rename:
      commit.branch = RepoFilter._do_tag_rename(args.tag_rename, commit.branch)
    if self._refname_callback:
      commit.branch = self._refname_callback(commit.branch)

    # Filter or rename the list of file changes
    orig_file_changes = set(commit.file_changes)
    self._filter_files(commit)

    # Record ancestry graph
    parents, orig_parents = commit.parents, aux_info['orig_parents']
    if self._args.state_branch:
      external_parents = parents
    else:
      # Integer parents are marks within this stream; non-integers are
      # commit hashes external to the stream (excluded by the export)
      external_parents = [p for p in parents if not isinstance(p, int)]
    # The use of 'reversed' is intentional here; there is a risk that we have
    # duplicates in parents, and we want to map from parents to the first
    # entry we find in orig_parents in such cases.
    parent_reverse_dict = dict(zip(reversed(parents), reversed(orig_parents)))

    self._graph.record_external_commits(external_parents)
    self._orig_graph.record_external_commits(external_parents)
    self._graph.add_commit_and_parents(commit.id, parents) # new githash unknown
    self._orig_graph.add_commit_and_parents(commit.old_id, orig_parents,
                                            commit.original_id)

    # Prune parents (due to pruning of empty commits) if relevant, note that
    # new_1st_parent is None unless this was a merge commit that is becoming
    # a non-merge
    prev_1st_parent = parents[0] if parents else None
    parents, new_1st_parent = self._maybe_trim_extra_parents(orig_parents,
                                                             parents)
    commit.parents = parents

    # If parents were pruned, then we need our file changes to be relative
    # to the new first parent
    #
    # Notes:
    #   * new_1st_parent and new_1st_parent != parents[0] uniquely happens for example when:
    #       working on merge, selecting subset of files and merge base still
    #       valid while first parent history doesn't touch any of those paths,
    #       but second parent history does.  prev_1st_parent had already been
    #       rewritten to the non-None first ancestor and it remains valid.
    #       self._maybe_trim_extra_parents() avoids removing this first parent
    #       because it'd make the commit a non-merge.  However, if there are
    #       no file_changes of note, we'll drop this commit and mark
    #       new_1st_parent as the new replacement.  To correctly determine if
    #       there are no file_changes of note, we need to have the list of
    #       file_changes relative to new_1st_parent.
    #       (See t9390#3, "basic -> basic-ten using '--path ten'")
    #   * prev_1st_parent != parents[0] happens for example when:
    #       similar to above, but the merge base is no longer valid and was
    #       pruned away as well.  Then parents started as e.g. [None, $num],
    #       and both prev_1st_parent and new_1st_parent are None, while parents
    #       after self._maybe_trim_extra_parents() becomes just [$num].
    #       (See t9390#67, "degenerate merge with non-matching filename".)
    #       Since $num was originally a second parent, we need to rewrite
    #       file changes to be relative to parents[0].
    #   * TODO: We should be getting the changes relative to the new first
    #       parent even if self._fep is None, BUT we can't.  Our method of
    #       getting the changes right now is an external git diff invocation,
    #       which we can't do if we just have a fast export stream.  We can't
    #       really work around it by querying the fast-import stream either,
    #       because the 'ls' directive only allows us to list info about
    #       specific paths, but we need to find out which paths exist in two
    #       commits and then query them.  We could maybe force checkpointing in
    #       fast-import, then doing a diff from what'll be the new first parent
    #       back to prev_1st_parent (which may be None, i.e. empty tree), using
    #       the fact that in A->{B,C}->D, where D is merge of B & C, the diff
    #       from C->D == C->A + A->B + B->D, and in these cases A==B, so it
    #       simplifies to C->D == C->A + B->D, and C is our new 1st parent
    #       commit, A is prev_1st_commit, and B->D is commit.file_changes that
    #       we already have.  However, checkpointing the fast-import process
    #       and figuring out how long to wait before we can run our diff just
    #       seems excessive.  For now, just punt and assume the merge wasn't
    #       "evil" (i.e. that its remerge-diff is empty, as is true for most
    #       merges).  If the merge isn't evil, no further steps are necessary.
    if parents and self._fep and (
        prev_1st_parent != parents[0] or
        new_1st_parent and new_1st_parent != parents[0]):
      # Get the id from the original fast export stream corresponding to the
      # new 1st parent.  As noted above, that new 1st parent might be
      # new_1st_parent, or if that is None, it'll be parents[0].
      will_be_1st = new_1st_parent or parents[0]
      old_id = parent_reverse_dict[will_be_1st]
      # Now, translate that to a hash
      will_be_1st_commit_hash = self._orig_graph.map_to_hash(old_id)
      # Get the changes from what is going to be the new 1st parent to this
      # merge commit.  Note that since we are going from the new 1st parent
      # to the merge commit, we can just replace the existing
      # commit.file_changes rather than getting something we need to combine
      # with the existing commit.file_changes.  Also, we can just replace
      # because prev_1st_parent is an ancestor of will_be_1st_commit_hash
      # (or prev_1st_parent is None and first parent history is gone), so
      # even if we retain prev_1st_parent and do not prune it, the changes
      # will still work given the snapshot-based way fast-export/fast-import
      # work.
      commit.file_changes = GitUtils.get_file_changes(self._repo_working_dir,
                                                      will_be_1st_commit_hash,
                                                      commit.original_id)

      # Save these and filter them
      orig_file_changes = set(commit.file_changes)
      self._filter_files(commit)

    # Process the --file-info-callback
    if self._file_info_callback:
      if self._file_info_value is None:
        # Lazily created once, then reused for all subsequent commits
        source_working_dir = self._args.source or b'.'
        self._file_info_value = FileInfoValueHelper(self._args.replace_text,
                                                    self.insert,
                                                    source_working_dir)
      new_file_changes = []
      for change in commit.file_changes:
        if change.type != b'D':
          assert(change.type == b'M')
          (filename, mode, blob_id) = \
            self._file_info_callback(change.filename,
                                     change.mode,
                                     change.blob_id,
                                     self._file_info_value)
          if mode is None:
            # TODO: Should deletion of the file even be a feature?  Might
            # want to remove this branch of the if-elif-else.
            assert(filename is not None)
            assert(blob_id is not None)
            new_change = FileChange(b'D', filename)
          elif filename is None:
            continue # Drop the FileChange from this commit
          else:
            new_change = FileChange(b'M', filename, blob_id, mode)
        else:
          new_change = change # use change as-is for deletions
        new_file_changes.append(new_change)
      commit.file_changes = new_file_changes

    # Call the user-defined callback, if any
    if self._commit_callback:
      self._commit_callback(commit, self.callback_metadata(aux_info))

    # Find out which files were modified by the callbacks.  Such paths could
    # lead to subsequent commits being empty (e.g. if removing a line containing
    # a password from every version of a file that had the password, and some
    # later commit did nothing more than remove that line)
    final_file_changes = set(commit.file_changes)
    if self._args.replace_text or self._blob_callback:
      # Blob contents may have changed without the FileChange objects
      # differing, so treat every touched path as potentially tweaked
      differences = orig_file_changes.union(final_file_changes)
    else:
      differences = orig_file_changes.symmetric_difference(final_file_changes)
    self._files_tweaked.update(x.filename for x in differences)

    # Now print the resulting commit, or if prunable skip it
    if not commit.dumped:
      if not self._prunable(commit, new_1st_parent,
                            aux_info['had_file_changes'], orig_parents):
        self._insert_into_stream(commit)
        self._record_remapping(commit, orig_parents)
      else:
        rewrite_to = new_1st_parent or commit.first_parent()
        commit.skip(new_id = rewrite_to)
        if self._args.state_branch:
          alias = Alias(commit.old_id or commit.id, rewrite_to or deleted_hash)
          self._insert_into_stream(alias)
        if commit.branch.startswith(b'refs/') or commit.branch == b'HEAD':
          # The special check above is because when direct revisions are passed
          # along to fast-export (such as with stashes), there is a chance the
          # revision is rewritten to nothing.  In such cases, we don't want to
          # point an invalid ref that just names a revision to some other point.
          reset = Reset(commit.branch, rewrite_to or deleted_hash)
          self._insert_into_stream(reset)
        self._commit_renames[commit.original_id] = None

    # Show progress
    self._num_commits += 1
    if not self._args.quiet:
      self._progress_writer.show(self._parsed_message % self._num_commits)
4157
4158 @staticmethod
4159 def _do_tag_rename(rename_pair, tagname):
4160 old, new = rename_pair.split(b':', 1)
4161 old, new = b'refs/tags/'+old, b'refs/tags/'+new
4162 if tagname.startswith(old):
4163 return tagname.replace(old, new, 1)
4164 return tagname
4165
4166 def _tweak_tag(self, tag):
4167 # Tweak the tag message according to callbacks
4168 if self._args.replace_message:
4169 for literal, replacement in self._args.replace_message['literals']:
4170 tag.message = tag.message.replace(literal, replacement)
4171 for regex, replacement in self._args.replace_message['regexes']:
4172 tag.message = regex.sub(replacement, tag.message)
4173 if self._message_callback:
4174 tag.message = self._message_callback(tag.message)
4175
4176 # Tweak the tag name according to tag-name-related callbacks
4177 tag_prefix = b'refs/tags/'
4178 fullref = tag_prefix+tag.ref
4179 if self._args.tag_rename:
4180 fullref = RepoFilter._do_tag_rename(self._args.tag_rename, fullref)
4181 if self._refname_callback:
4182 fullref = self._refname_callback(fullref)
4183 if not fullref.startswith(tag_prefix):
4184 msg = "Error: fast-import requires tags to be in refs/tags/ namespace."
4185 msg += "\n {} renamed to {}".format(tag_prefix+tag.ref, fullref)
4186 raise SystemExit(msg)
4187 tag.ref = fullref[len(tag_prefix):]
4188
4189 # Tweak the tagger according to callbacks
4190 if self._args.mailmap:
4191 tag.tagger_name, tag.tagger_email = \
4192 self._args.mailmap.translate(tag.tagger_name, tag.tagger_email)
4193 if self._name_callback:
4194 tag.tagger_name = self._name_callback(tag.tagger_name)
4195 if self._email_callback:
4196 tag.tagger_email = self._email_callback(tag.tagger_email)
4197
4198 # Call general purpose tag callback
4199 if self._tag_callback:
4200 self._tag_callback(tag, self.callback_metadata())
4201
4202 def _tweak_reset(self, reset):
4203 if self._args.tag_rename:
4204 reset.ref = RepoFilter._do_tag_rename(self._args.tag_rename, reset.ref)
4205 if self._refname_callback:
4206 reset.ref = self._refname_callback(reset.ref)
4207 if self._reset_callback:
4208 self._reset_callback(reset, self.callback_metadata())
4209
4210 def results_tmp_dir(self, create_if_missing=True):
4211 target_working_dir = self._args.target or b'.'
4212 git_dir = GitUtils.determine_git_dir(target_working_dir)
4213 d = os.path.join(git_dir, b'filter-repo')
4214 if create_if_missing and not os.path.isdir(d):
4215 os.mkdir(d)
4216 return d
4217
4218 def _load_marks_file(self, marks_basename):
4219 full_branch = 'refs/heads/{}'.format(self._args.state_branch)
4220 marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
4221 working_dir = self._args.target or b'.'
4222 cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
4223 contents = b''
4224 if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
4225 cmd = ['git', '-C', working_dir, 'show',
4226 '%s:%s' % (full_branch, decode(marks_basename))]
4227 try:
4228 contents = subproc.check_output(cmd)
4229 except subprocess.CalledProcessError as e: # pragma: no cover
4230 raise SystemExit(_("Failed loading %s from %s") %
4231 (decode(marks_basename), full_branch))
4232 if contents:
4233 biggest_id = max(int(x.split()[0][1:]) for x in contents.splitlines())
4234 _IDS._next_id = max(_IDS._next_id, biggest_id+1)
4235 with open(marks_file, 'bw') as f:
4236 f.write(contents)
4237 return marks_file
4238
4239 def _save_marks_files(self):
4240 basenames = [b'source-marks', b'target-marks']
4241 working_dir = self._args.target or b'.'
4242
4243 # Check whether the branch exists
4244 parent = []
4245 full_branch = 'refs/heads/{}'.format(self._args.state_branch)
4246 cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
4247 if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
4248 parent = ['-p', full_branch]
4249
4250 # Run 'git hash-object $MARKS_FILE' for each marks file, save result
4251 blob_hashes = {}
4252 for marks_basename in basenames:
4253 marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
4254 if not os.path.isfile(marks_file): # pragma: no cover
4255 raise SystemExit(_("Failed to find %s to save to %s")
4256 % (marks_file, self._args.state_branch))
4257 cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file]
4258 blob_hashes[marks_basename] = subproc.check_output(cmd).strip()
4259
4260 # Run 'git mktree' to create a tree out of it
4261 p = subproc.Popen(['git', '-C', working_dir, 'mktree'],
4262 stdin=subprocess.PIPE, stdout=subprocess.PIPE)
4263 for b in basenames:
4264 p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b))
4265 p.stdin.close()
4266 p.wait()
4267 tree = p.stdout.read().strip()
4268
4269 # Create the new commit
4270 cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files',
4271 tree] + parent)
4272 commit = subproc.check_output(cmd).strip()
4273 subproc.call(['git', '-C', working_dir, 'update-ref', full_branch, commit])
4274
  def importer_only(self):
    # Configure this RepoFilter to run only the fast-import side; the
    # fast-export/filtering side is expected to be driven externally
    # (e.g. by another RepoFilter feeding commits into self.insert()).
    self._run_sanity_checks()
    self._setup_output()
4278
  def set_output(self, outputRepoFilter):
    # Direct this RepoFilter's filtered stream into outputRepoFilter's
    # already-established fast-import process instead of starting our own.
    assert outputRepoFilter._output

    # set_output implies this RepoFilter is doing exporting, though may not
    # be the only one.
    self._setup_input(use_done_feature = False)

    # Set our output management up to pipe to outputRepoFilter's locations
    # (_managed_output = False: presumably outputRepoFilter is responsible
    # for closing/finalizing the shared output -- verify against callers)
    self._managed_output = False
    self._output = outputRepoFilter._output
    self._import_pipes = outputRepoFilter._import_pipes

    # Handle sanity checks, though currently none needed for export-only cases
    self._run_sanity_checks()
4293
4294 def _read_stash(self):
4295 if self._stash:
4296 return
4297 if self._orig_refs and b'refs/stash' in self._orig_refs and \
4298 self._args.refs == ['--all']:
4299 repo_working_dir = self._args.source or b'.'
4300 git_dir = GitUtils.determine_git_dir(repo_working_dir)
4301 stash = os.path.join(git_dir, b'logs', b'refs', b'stash')
4302 if os.path.exists(stash):
4303 self._stash = []
4304 with open(stash, 'br') as f:
4305 for line in f:
4306 (oldhash, newhash, rest) = line.split(None, 2)
4307 self._stash.append((newhash, rest))
4308 self._args.refs.extend([x[0] for x in self._stash])
4309
4310 def _write_stash(self):
4311 last = deleted_hash
4312 if self._stash:
4313 target_working_dir = self._args.target or b'.'
4314 git_dir = GitUtils.determine_git_dir(target_working_dir)
4315 stash = os.path.join(git_dir, b'logs', b'refs', b'stash')
4316 with open(stash, 'bw') as f:
4317 for (hash, rest) in self._stash:
4318 new_hash = self._get_rename(hash)
4319 if new_hash is None:
4320 continue
4321 f.write(b' '.join([last, new_hash, rest]) + b'\n')
4322 last = new_hash
4323 print(_("Rewrote the stash."))
4324
  def _setup_input(self, use_done_feature):
    # Establish self._input, the fast-export stream we will parse: either
    # adopt stdin directly (--stdin), or launch a 'git fast-export' process
    # with flags derived from the requested filtering options.
    # NOTE(review): write_marks and date_format_permissive appear to be
    # module-level capability flags set elsewhere in this file -- confirm.
    if self._args.stdin:
      self._input = sys.stdin.detach()
      sys.stdin = None # Make sure no one tries to accidentally use it
      self._fe_orig = None
    else:
      self._read_stash()
      # We can skip asking fast-export for blob contents when no filter
      # needs to inspect them and we are not copying between repositories.
      skip_blobs = (self._blob_callback is None and
                    (self._args.replace_text is None or
                     self._file_info_callback is not None) and
                    self._args.source == self._args.target)
      extra_flags = []
      if skip_blobs:
        extra_flags.append('--no-data')
        if self._args.max_blob_size:
          # Without blob data in the stream we can't measure sizes
          # ourselves; pre-compute them from the object database instead.
          self._unpacked_size, packed_size = GitUtils.get_blob_sizes()
      if use_done_feature:
        extra_flags.append('--use-done-feature')
      if write_marks:
        extra_flags.append(b'--mark-tags')
      if self._args.state_branch:
        # State-branch support needs marks saved and restored across runs
        assert(write_marks)
        source_marks_file = self._load_marks_file(b'source-marks')
        extra_flags.extend([b'--export-marks='+source_marks_file,
                            b'--import-marks='+source_marks_file])
      if self._args.preserve_commit_encoding is not None: # pragma: no cover
        reencode = 'no' if self._args.preserve_commit_encoding else 'yes'
        extra_flags.append('--reencode='+reencode)
      if self._args.date_order:
        extra_flags.append('--date-order')
      location = ['-C', self._args.source] if self._args.source else []
      fep_cmd = ['git'] + location + ['fast-export', '--show-original-ids',
                 '--signed-tags=strip', '--tag-of-filtered-object=rewrite',
                 '--fake-missing-tagger', '--reference-excluded-parents'
                 ] + extra_flags + self._args.refs
      self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE)
      self._input = self._fep.stdout
      if self._args.dry_run or self._args.debug:
        # Tee the original fast-export stream to a file for inspection
        self._fe_orig = os.path.join(self.results_tmp_dir(),
                                     b'fast-export.original')
        output = open(self._fe_orig, 'bw')
        self._input = InputFileBackup(self._input, output)
        if self._args.debug:
          tmp = [decode(x) if isinstance(x, bytes) else x for x in fep_cmd]
          print("[DEBUG] Running: {}".format(' '.join(tmp)))
          print(" (saving a copy of the output at {})"
                .format(decode(self._fe_orig)))
4372
  def _setup_output(self):
    # Establish self._output, where the filtered stream is written: a
    # 'git fast-import' process normally, or just a file under --dry-run
    # (with both destinations via DualFileWriter under --debug).
    # NOTE(review): date_format_permissive appears to be a module-level
    # capability flag set elsewhere in this file -- confirm.
    if not self._args.dry_run:
      location = ['-C', self._args.target] if self._args.target else []
      fip_cmd = ['git'] + location + ['-c', 'core.ignorecase=false',
                                      'fast-import', '--force', '--quiet']
      if date_format_permissive:
        fip_cmd.append('--date-format=raw-permissive')
      if self._args.state_branch:
        # Persist/restore marks so later runs can resume incrementally
        target_marks_file = self._load_marks_file(b'target-marks')
        fip_cmd.extend([b'--export-marks='+target_marks_file,
                        b'--import-marks='+target_marks_file])
      self._fip = subproc.Popen(fip_cmd, bufsize=-1,
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE)
      self._import_pipes = (self._fip.stdin, self._fip.stdout)
    if self._args.dry_run or self._args.debug:
      # Write the filtered stream to a file for inspection
      self._fe_filt = os.path.join(self.results_tmp_dir(),
                                   b'fast-export.filtered')
      self._output = open(self._fe_filt, 'bw')
    else:
      self._output = self._fip.stdin
    if self._args.debug and not self._args.dry_run:
      # Debug (without dry-run): feed fast-import AND keep the file copy
      self._output = DualFileWriter(self._fip.stdin, self._output)
      tmp = [decode(x) if isinstance(x, bytes) else x for x in fip_cmd]
      print("[DEBUG] Running: {}".format(' '.join(tmp)))
      print(" (using the following file as input: {})"
            .format(decode(self._fe_filt)))
4399
  def _migrate_origin_to_heads(self):
    # Turn refs/remotes/origin/* into refs/heads/* (so they are rewritten
    # like local branches), optionally force-fetch everything from origin
    # for sensitive-data removals, and finally remove the origin remote
    # (except for sensitive-data removals, where it is kept).
    source_working_dir = self._args.source or b'.'
    target_working_dir = self._args.target or b'.'
    refs_to_migrate = set(x for x in self._orig_refs
                          if x.startswith(b'refs/remotes/origin/'))
    refs_to_warn_about = set()
    if refs_to_migrate:
      if self._args.debug:
        print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*")
      p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
                        stdin=subprocess.PIPE, cwd=source_working_dir)
      for ref in refs_to_migrate:
        if ref == b'refs/remotes/origin/HEAD':
          # origin/HEAD is just deleted rather than migrated
          p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
          del self._orig_refs[ref]
          continue
        newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/')
        if newref not in self._orig_refs:
          p.stdin.write(b'create %s %s\n' % (newref, self._orig_refs[ref]))
          self._orig_refs[newref] = self._orig_refs[ref]
        elif self._orig_refs[ref] != self._orig_refs[newref]:
          # Local branch diverged from its origin counterpart; remember it
          # so we can warn before any forced fetch below clobbers it
          refs_to_warn_about.add(newref)
        p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
        del self._orig_refs[ref]
      p.stdin.close()
      if p.wait(): # pragma: no cover
        msg = _("git update-ref failed; see above")
        raise SystemExit(msg)

    if b'remote.origin.url' not in self._config_settings:
      return

    # For sensitive data removals, fetch ALL refs.  Non-mirror clones normally
    # only grab branches and tags, but other refs may hold on to the sensitive
    # data as well.
    if self._args.sensitive_data_removal and \
       not self._args.no_fetch and \
       not self._already_ran and \
       self._config_settings.get(b'remote.origin.mirror', b'false') != b'true':

      if refs_to_warn_about:
        msg = ("Warning: You have refs modified from upstream:\n " +
               "\n ".join([decode(x) for x in refs_to_warn_about]) +
               "\n" +
               " We want to forcibly fetch from upstream to ensure\n" +
               " that all relevent refs are rewritten, but this will\n" +
               " discard your local changes before starting the\n" +
               " rewrite. Proceed with fetch (Y/N)?")
        response = input(msg)

        if response.lower() != 'y':
          self._args.no_fetch = True
          # Don't do the fetch, and don't remove the origin remote
          return

      cmd = 'git fetch -q --prune --update-head-ok --refmap "" origin +refs/*:refs/*'
      m = _("NOTICE: Fetching all refs from origin to make sure we rewrite\n"
            " all history that may reference the sensitive data, via\n"
            " "+cmd)
      print(m)
      # The '""' placeholder has to become a real empty argument; a plain
      # split() would otherwise pass literal double quotes to git
      ret = subproc.call([arg if arg != '""' else '' for arg in cmd.split()],
                         cwd=source_working_dir)
      if ret != 0: # pragma: no cover
        m = _("Warning: Fetching all refs from origin failed")
        print(m)
    if self._args.sensitive_data_removal:
      return

    # Now remove the origin remote
    url = self._config_settings[b'remote.origin.url'].decode(errors='replace')
    m = _("NOTICE: Removing 'origin' remote; see 'Why is my origin removed?'\n"
          " in the manual if you want to push back there.\n"
          " (was %s)") % url
    print(m)
    subproc.call('git remote rm origin'.split(), cwd=target_working_dir)
4475
4476 def _final_commands(self):
4477 self._finalize_handled = True
4478 self._done_callback and self._done_callback()
4479
4480 if self._file_info_value:
4481 self._file_info_value.finalize()
4482 if not self._args.quiet:
4483 self._progress_writer.finish()
4484
4485 def _ref_update(self, target_working_dir):
4486 # Start the update-ref process
4487 p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
4488 stdin=subprocess.PIPE,
4489 cwd=target_working_dir)
4490
4491 # Remove replace_refs from _orig_refs
4492 replace_refs = {k:v for k, v in self._orig_refs.items()
4493 if k.startswith(b'refs/replace/')}
4494 reverse_replace_refs = collections.defaultdict(list)
4495 for k,v in replace_refs.items():
4496 reverse_replace_refs[v].append(k)
4497 all(map(self._orig_refs.pop, replace_refs))
4498
4499 # Remove unused refs
4500 exported_refs, imported_refs = self.get_exported_and_imported_refs()
4501 refs_to_nuke = exported_refs - imported_refs
4502 # Because revisions can be passed to fast-export which handles them as
4503 # though they were refs, we might have bad "refs" to nuke; strip them out.
4504 refs_to_nuke = [x for x in refs_to_nuke
4505 if x.startswith(b'refs/') or x == b'HEAD']
4506 if self._args.partial:
4507 refs_to_nuke = set()
4508 if refs_to_nuke and self._args.debug:
4509 print("[DEBUG] Deleting the following refs:\n "+
4510 decode(b"\n ".join(sorted(refs_to_nuke))))
4511 p.stdin.write(b''.join([b"delete %s\n" % x
4512 for x in refs_to_nuke]))
4513
4514 # Delete or update and add replace_refs; note that fast-export automatically
4515 # handles 'update-no-add', we only need to take action for the other four
4516 # choices for replace_refs.
4517 self._flush_renames()
4518 actual_renames = {k:v for k,v in self._commit_renames.items() if k != v}
4519 if self._args.replace_refs in ['delete-no-add', 'delete-and-add']:
4520 # Delete old replace refs, if unwanted
4521 replace_refs_to_nuke = set(replace_refs)
4522 if self._args.replace_refs == 'delete-and-add':
4523 # git-update-ref won't allow us to update a ref twice, so be careful
4524 # to avoid deleting refs we'll later update
4525 replace_refs_to_nuke = replace_refs_to_nuke.difference(
4526 [b'refs/replace/'+x for x in actual_renames])
4527 p.stdin.write(b''.join([b"delete %s\n" % x
4528 for x in replace_refs_to_nuke]))
4529 if self._args.replace_refs in ['delete-and-add', 'update-or-add',
4530 'update-and-add']:
4531 # Add new replace refs
4532 update_only = (self._args.replace_refs == 'update-or-add')
4533 p.stdin.write(b''.join([b"update refs/replace/%s %s\n" % (old, new)
4534 for old,new in actual_renames.items()
4535 if new and not (update_only and
4536 old in reverse_replace_refs)]))
4537
4538 # Complete the update-ref process
4539 p.stdin.close()
4540 if p.wait():
4541 raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover
4542
4543 def _remap_to(self, oldish_hash):
4544 '''
4545 Given an oldish_hash (from the beginning of the current run), return:
4546 IF oldish_hash is NOT pruned:
4547 the hash of the rewrite of oldish_hash
4548 otherwise:
4549 the hash of the rewrite of the first unpruned ancestor of oldish_hash
4550 '''
4551 old_id = self._orig_graph._hash_to_id[oldish_hash]
4552 new_id = _IDS.translate(old_id)
4553 new_hash = self._graph.git_hash[new_id] if new_id else deleted_hash
4554 return new_hash
4555
  def _compute_metadata(self, metadata_dir, orig_refs):
    """Compute the metadata this run should record under $GIT_DIR/filter-repo.

    Merges this run's results with any previous run's files found in
    metadata_dir (commit-map, ref-map, first-changed-commits) so the
    recorded mappings always relate *original* names to the latest ones.

    Returns a 3-tuple:
      commit_renames: dict of original commit hash -> newest hash (or None
        for commits pruned by this run),
      ref_maps: dict of refname -> (original hash, new hash),
      first_changes: dict of each earliest-changed commit -> its surviving
        rewritten self-or-ancestor (deleted_hash if none survived).
    """
    #
    # First, handle commit_renames
    #
    old_commit_renames = dict()
    if not self._already_ran:
      commit_renames = {old: new
                        for old, new in self._commit_renames.items()
                        }
    else:
      # Read commit-map into old_commit_renames
      with open(os.path.join(metadata_dir, b'commit-map'), 'br') as f:
        f.readline() # Skip the header line
        for line in f:
          (old,new) = line.split()
          old_commit_renames[old] = new
      # Use A->B mappings in old_commit_renames, and B->C mappings in
      # self._commit_renames to yield A->C mappings in commit_renames
      commit_renames = {old: self._commit_renames.get(newish, newish)
                        for old, newish in old_commit_renames.items()}
      # If there are any B->C mappings in self._commit_renames for which
      # there was no A->B mapping in old_commit_renames, then add the
      # B->C mapping to commit_renames too.
      seen = set(old_commit_renames.values())
      commit_renames.update({old: new
                             for old, new in self._commit_renames.items()
                             if old not in seen})

    #
    # Second, handle ref_maps
    #
    exported_refs, imported_refs = self.get_exported_and_imported_refs()

    old_commit_unrenames = dict()
    if not self._already_ran:
      old_ref_map = dict((refname, (old_hash, deleted_hash))
                         for refname, old_hash in orig_refs.items()
                         if refname in exported_refs)
    else:
      # old_commit_renames talk about how commits were renamed in the original
      # run.  Let's reverse it to find out how to get from the intermediate
      # commit name, back to the original.  Because everything in orig_refs
      # right now refers to the intermediate commits after the first run(s),
      # and we need to map them back to what they were before any changes.
      old_commit_unrenames = dict((v,k) for (k,v) in old_commit_renames.items())

      old_ref_map = {}
      # Populate old_ref_map from the 'ref-map' file
      with open(os.path.join(metadata_dir, b'ref-map'), 'br') as f:
        f.readline() # Skip the header line
        for line in f:
          (old,intermediate,ref) = line.split()
          old_ref_map[ref] = (old, intermediate)
      # Append to old_ref_map items from orig_refs that were exported, but
      # get the actual original commit name
      for refname, old_hash in orig_refs.items():
        if refname in old_ref_map:
          continue
        if refname not in exported_refs:
          continue
        # Compute older_hash
        original_hash = old_commit_unrenames.get(old_hash, old_hash)
        old_ref_map[refname] = (original_hash, deleted_hash)

    new_refs = {}
    new_refs_initialized = False
    ref_maps = {}
    self._orig_graph._ensure_reverse_maps_populated()
    for refname, pair in old_ref_map.items():
      old_hash, hash_ref_becomes_if_not_imported_in_this_run = pair
      if refname not in imported_refs:
        new_hash = hash_ref_becomes_if_not_imported_in_this_run
      elif old_hash in commit_renames:
        # Chase the rename through the intermediate (previous-run) name
        intermediate = old_commit_renames.get(old_hash,old_hash)
        if intermediate in self._commit_renames:
          new_hash = self._remap_to(intermediate)
        else:
          new_hash = intermediate
      else: # Must be either an annotated tag, or a ref whose tip was pruned
        if not new_refs_initialized:
          # Lazily query the target repo's refs, only when first needed
          target_working_dir = self._args.target or b'.'
          new_refs = GitUtils.get_refs(target_working_dir)
          new_refs_initialized = True
        if refname in new_refs:
          new_hash = new_refs[refname]
        else:
          new_hash = deleted_hash
      ref_maps[refname] = (old_hash, new_hash)
    if self._args.source or self._args.target:
      # When copying between repos, refs that exist only in the target are
      # recorded as created from an all-zeros "old" hash
      if not new_refs_initialized:
        target_working_dir = self._args.target or b'.'
        new_refs = GitUtils.get_refs(target_working_dir)
        new_refs_initialized = True
      for ref, new_hash in new_refs.items():
        if ref not in orig_refs and not ref.startswith(b'refs/replace/'):
          old_hash = b'0'*len(new_hash)
          ref_maps[ref] = (old_hash, new_hash)

    #
    # Third, handle first_changes
    #

    old_first_changes = dict()
    if self._already_ran:
      # Read first_changes into old_first_changes
      with open(os.path.join(metadata_dir, b'first-changed-commits'), 'br') as f:
        for line in f:
          changed_commit, undeleted_self_or_ancestor = line.strip().split()
          old_first_changes[changed_commit] = undeleted_self_or_ancestor
    # We need to find the commits that were modified whose parents were not.
    # To be able to find parents, we need the commit names as of the beginning
    # of this run, and then when we are done, we need to map them back to the
    # name of the commits from before any git-filter-repo runs.
    #
    # We are excluding here any commits deleted in previous git-filter-repo
    # runs
    # NOTE(review): undo_old_commit_renames and special_changed_commits do
    # not appear to be used later in this function -- confirm before removal.
    undo_old_commit_renames = dict((v,k) for (k,v) in old_commit_renames.items()
                                   if v != deleted_hash)
    # Get a list of all commits that were changed, as of the beginning of
    # this latest run.
    changed_commits = {new
                       for (old,new) in old_commit_renames.items()
                       if old != new and new != deleted_hash} | \
                      {old
                       for (old,new) in self._commit_renames.items()
                       if old != new}
    special_changed_commits = {old
                               for (old,new) in old_commit_renames.items()
                               if new == deleted_hash}
    first_changes = dict()
    for (old,new) in self._commit_renames.items():
      if old == new:
        # old wasn't modified, can't be first change if not even a change
        continue
      if old_commit_unrenames.get(old,old) != old:
        # old was already modified in previous run; while it might represent
        # something that is still a first change, we'll handle that as we
        # loop over old_first_changes below
        continue
      if any(parent in changed_commits
             for parent in self._orig_graph.get_parent_hashes(old)):
        # a parent of old was modified, so old is not a first change
        continue
      # At this point, old IS a first change.  We need to find out what new
      # commit it maps to, or if it doesn't map to one, what new commit was
      # its most recent ancestor that wasn't pruned.
      if new is None:
        new = self._remap_to(old)
      first_changes[old] = (new if new is not None else deleted_hash)
    for (old,undeleted_self_or_ancestor) in old_first_changes.items():
      if undeleted_self_or_ancestor == deleted_hash:
        # old represents a commit that was pruned and whose entire ancestry
        # was pruned.  So, old is still a first change
        first_changes[old] = undeleted_self_or_ancestor
        continue
      intermediate = old_commit_renames.get(old, old)
      usoa = undeleted_self_or_ancestor
      new_ancestor = self._commit_renames.get(usoa, usoa)
      if intermediate == deleted_hash:
        # old was pruned in previous rewrite
        if usoa != new_ancestor:
          # old's ancestor got rewritten in this filtering run; we can drop
          # this one from first_changes.
          continue
        # Getting here means old was a first change and old was pruned in a
        # previous run, and its ancestors that survived were non rewritten in
        # this run, so old remains a first change
        first_changes[old] = new_ancestor # or usoa, since new_ancestor == usoa
        continue
      assert(usoa == intermediate) # old wasn't pruned => usoa == intermediate

      # Check whether parents of intermediate were rewritten.  Note that
      # intermediate in self._commit_renames only means that intermediate was
      # processed by the latest filtering (not necessarily that it changed),
      # but we need to know that before we can check for parent hashes having
      # changed.
      if intermediate not in self._commit_renames:
        # This commit was not processed by this run, so it remains a first
        # change
        first_changes[old] = usoa
        continue
      if any(parent in changed_commits
             for parent in self._orig_graph.get_parent_hashes(intermediate)):
        # An ancestor was modified by this run, so it is no longer a first
        # change; continue to the next one.
        continue
      # This change is a first_change; find the new commit its usoa maps to
      new = self._remap_to(intermediate)
      assert(new is not None)
      first_changes[old] = new

    return commit_renames, ref_maps, first_changes
4748
4749 def _handle_lfs_metadata(self, metadata_dir):
4750 if self._lfs_object_tracker is None:
4751 print("NOTE: LFS object orphaning not checked (LFS not in use)")
4752 return
4753
4754 if self._args.partial:
4755 target_working_dir = self._args.target or b'.'
4756 source = False
4757 self._lfs_object_tracker.find_all_lfs_objects_in_repo(target_working_dir,
4758 source)
4759
4760 with open(os.path.join(metadata_dir, b'original_lfs_objects'), 'bw') as f:
4761 for obj in sorted(self._lfs_object_tracker.source_objects.objects):
4762 f.write(obj+b"\n")
4763
4764 orphaned_lfs_path = os.path.join(metadata_dir, b'orphaned_lfs_objects')
4765 msg = textwrap.dedent(_(f"""\
4766 NOTE: There were LFS Objects Orphaned by this rewrite recorded in
4767 {decode(orphaned_lfs_path)}."""))
4768 with open(orphaned_lfs_path, 'bw') as f:
4769 differences = self._lfs_object_tracker.source_objects.objects - \
4770 self._lfs_object_tracker.target_objects.objects
4771 for obj in sorted(differences):
4772 f.write(obj+b"\n")
4773 if differences:
4774 self._lfs_object_tracker.objects_orphaned = True
4775 print(msg)
4776
4777 def _record_metadata(self, metadata_dir, orig_refs):
4778 self._flush_renames()
4779 commit_renames, ref_maps, first_changes = \
4780 self._compute_metadata(metadata_dir, orig_refs)
4781
4782 if self._args.sensitive_data_removal:
4783 changed_commits = sum(k!=v for (k,v) in commit_renames.items())
4784 print(f"You rewrote {changed_commits} (of {len(commit_renames)}) commits.")
4785 print("") # Add a blank line before important rewrite information
4786 print(f"NOTE: First Changed Commit(s) is/are:\n "
4787 + decode(b"\n ".join(x for x in first_changes)))
4788
4789 with open(os.path.join(metadata_dir, b'sensitive_data_removal'), 'bw') as f:
4790 pass # Write nothing; we only need the file created
4791
4792 self._handle_lfs_metadata(metadata_dir)
4793 print("") # Add a blank line after important rewrite information
4794
4795 with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f:
4796 f.write(("%-40s %s\n" % (_("old"), _("new"))).encode())
4797 for (old,new) in sorted(commit_renames.items()):
4798 msg = b'%s %s\n' % (old, new if new != None else deleted_hash)
4799 f.write(msg)
4800
4801 with open(os.path.join(metadata_dir, b'ref-map'), 'bw') as f:
4802 f.write(("%-40s %-40s %s\n" % (_("old"), _("new"), _("ref"))).encode())
4803 for refname, hash_pair in sorted(ref_maps.items()):
4804 (old_hash, new_hash) = hash_pair
4805 f.write(b'%s %s %s\n' % (old_hash, new_hash, refname))
4806 if old_hash != new_hash:
4807 self._changed_refs.add(refname)
4808
4809 with open(os.path.join(metadata_dir, b'changed-refs'), 'bw') as f:
4810 for refname in sorted(self._changed_refs):
4811 f.write(b'%s\n' % refname)
4812
4813 with open(os.path.join(metadata_dir, b'first-changed-commits'), 'bw') as f:
4814 for commit, undeleted_self_or_ancestor in sorted(first_changes.items()):
4815 f.write(b'%s %s\n' % (commit, undeleted_self_or_ancestor))
4816
4817 with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f:
4818 issues_found = False
4819 if self._commits_no_longer_merges:
4820 issues_found = True
4821
4822 f.write(textwrap.dedent(_('''
4823 The following commits used to be merge commits but due to filtering
4824 are now regular commits; they likely have suboptimal commit messages
4825 (e.g. "Merge branch next into master"). Original commit hash on the
4826 left, commit hash after filtering/rewriting on the right:
4827 ''')[1:]).encode())
4828 for oldhash, newhash in self._commits_no_longer_merges:
4829 f.write(' {} {}\n'.format(oldhash, newhash).encode())
4830 f.write(b'\n')
4831
4832 if self._commits_referenced_but_removed:
4833 issues_found = True
4834 f.write(textwrap.dedent(_('''
4835 The following commits were filtered out, but referenced in another
4836 commit message. The reference to the now-nonexistent commit hash
4837 (or a substring thereof) was left as-is in any commit messages:
4838 ''')[1:]).encode())
4839 for bad_commit_reference in self._commits_referenced_but_removed:
4840 f.write(' {}\n'.format(bad_commit_reference).encode())
4841 f.write(b'\n')
4842
4843 if not issues_found:
4844 f.write(_("No filtering problems encountered.\n").encode())
4845
4846 with open(os.path.join(metadata_dir, b'already_ran'), 'bw') as f:
4847 f.write(_("This file exists to allow you to filter again without --force,\n"
4848 "and to specify that metadata files should be updated instead\n"
4849 "of rewritten").encode())
4850
4851 def finish(self):
4852 ''' Alternative to run() when there is no input of our own to parse,
4853 meaning that run only really needs to close the handle to fast-import
4854 and let it finish, thus making a call to "run" feel like a misnomer. '''
4855 assert not self._input
4856 assert self._managed_output
4857 self.run()
4858
4859 def insert(self, obj, direct_insertion = False):
4860 if not direct_insertion:
4861 if type(obj) == Blob:
4862 self._tweak_blob(obj)
4863 elif type(obj) == Commit:
4864 aux_info = {'orig_parents': obj.parents,
4865 'had_file_changes': bool(obj.file_changes)}
4866 self._tweak_commit(obj, aux_info)
4867 elif type(obj) == Reset:
4868 self._tweak_reset(obj)
4869 elif type(obj) == Tag:
4870 self._tweak_tag(obj)
4871 self._insert_into_stream(obj)
4872
4873 def _insert_into_stream(self, obj):
4874 if not obj.dumped:
4875 if self._lfs_object_tracker:
4876 self._lfs_object_tracker.check_output_object(obj)
4877 if self._parser:
4878 self._parser.insert(obj)
4879 else:
4880 obj.dump(self._output)
4881
  def get_exported_and_imported_refs(self):
    """Return the refs exported/imported, as tracked by our parser."""
    return self._parser.get_exported_and_imported_refs()
4884
  def run(self):
    """Run the full filtering pipeline.

    Sets up fast-export and fast-import (unless input/output were provided
    externally), streams and tweaks all objects, waits for the subprocesses,
    updates refs, records run metadata, and performs final cleanup.  Steps
    here are strictly order-dependent: export must finish before we close
    input, and output must close before waiting on fast-import.
    """
    start = time.time()
    # Only set up the pipeline ourselves if the caller supplied neither end.
    if not self._input and not self._output:
      self._run_sanity_checks()
      if not self._args.dry_run and not self._args.partial:
        self._read_stash()
        self._migrate_origin_to_heads()
      self._setup_input(use_done_feature = True)
      self._setup_output()
    assert self._sanity_checks_handled

    if self._input:
      # Create and run the filter
      self._repo_working_dir = self._args.source or b'.'
      self._parser = FastExportParser(blob_callback = self._tweak_blob,
                                      commit_callback = self._tweak_commit,
                                      tag_callback = self._tweak_tag,
                                      reset_callback = self._tweak_reset,
                                      done_callback = self._final_commands)
      self._setup_lfs_orphaning_checks()
      self._parser.run(self._input, self._output)
      # The done callback may not have fired (e.g. no 'done' in the stream).
      if not self._finalize_handled:
        self._final_commands()

      # Make sure fast-export completed successfully
      if not self._args.stdin and self._fep.wait():
        raise SystemExit(_("Error: fast-export failed; see above.")) # pragma: no cover
      self._input.close()

    # If we're not the manager of self._output, we should avoid post-run cleanup
    if not self._managed_output:
      return

    # Close the output and ensure fast-import successfully completes
    self._output.close()
    if not self._args.dry_run and self._fip.wait():
      raise SystemExit(_("Error: fast-import failed; see above.")) # pragma: no cover

    # With fast-export and fast-import complete, update state if requested
    if self._args.state_branch:
      self._save_marks_files()

    # Notify user how long it took, before doing a gc and such
    msg = "New history written in {:.2f} seconds..."
    if self._args.repack:
      msg = "New history written in {:.2f} seconds; now repacking/cleaning..."
    print(msg.format(time.time()-start))

    # Exit early, if requested
    if self._args.dry_run:
      print(_("NOTE: Not running fast-import or cleaning up; --dry-run passed."))
      if self._fe_orig:
        print(_("  Requested filtering can be seen by comparing:"))
        print("  " + decode(self._fe_orig))
      else:
        print(_("  Requested filtering can be seen at:"))
        print("  " + decode(self._fe_filt))
      return

    target_working_dir = self._args.target or b'.'
    if self._input:
      self._ref_update(target_working_dir)

    # Write out data about run
    self._record_metadata(self.results_tmp_dir(), self._orig_refs)

    # Final cleanup:
    # If we need a repack, then nuke the reflogs and repack.
    # If we need a reset, do a reset --hard
    reset = not GitUtils.is_repository_bare(target_working_dir)
    self.cleanup(target_working_dir, self._args.repack, reset,
                 run_quietly=self._args.quiet,
                 show_debuginfo=self._args.debug)

    # Let user know how long it took
    print(_("Completely finished after {:.2f} seconds.")
          .format(time.time()-start))

    # Give post-rewrite instructions for cleaning up other copies for SDR
    if self._args.sensitive_data_removal:
      lfs_note = ""
      if self._lfs_object_tracker and \
         self._lfs_object_tracker.objects_orphaned == True:
        lfs_note = _(" and LFS Objects Orphaned")
      # Default assumes a normal fetched clone; adjust for --no-fetch modes.
      push_command = "git push --force --mirror origin"
      if self._args.no_fetch:
        if self._args.partial:
          push_command = "git push --force origin " + \
                         " ".join(sorted([decode(x) for x in self._changed_refs]))
        else:
          push_command = "git push --all --tags origin"
      print("")
      print(sdr_next_steps % (push_command, lfs_note, lfs_note))
4978
def main():
  """Command-line entry point: parse args, then analyze or filter the repo."""
  setup_gettext()
  args = FilteringOptions.parse_args(sys.argv[1:])
  if args.analyze:
    RepoAnalyze.run(args)
  else:
    # Named repo_filter rather than "filter" to avoid shadowing the builtin.
    repo_filter = RepoFilter(args)
    repo_filter.run()
4987
# Dispatch to main() only when executed as a script; importers get the
# library API without side effects.
if __name__ == '__main__':
  main()