# NOTE(review): stray line -- this file is git-filter-repo, not a monorepo
# management tool; this text also sits above the shebang (disabling it) and
# should almost certainly be deleted entirely.
1#!/usr/bin/env python3
2
3"""
4git-filter-repo filters git repositories, similar to git filter-branch, BFG
5repo cleaner, and others. The basic idea is that it works by running
6 git fast-export <options> | filter | git fast-import <options>
7where this program not only launches the whole pipeline but also serves as
8the 'filter' in the middle. It does a few additional things on top as well
9in order to make it into a well-rounded filtering tool.
10
11git-filter-repo can also be used as a library for more involved filtering
12operations; however:
13 ***** API BACKWARD COMPATIBILITY CAVEAT *****
14 Programs using git-filter-repo as a library can reach pretty far into its
15 internals, but I am not prepared to guarantee backward compatibility of
16 all APIs. I suspect changes will be rare, but I reserve the right to
17 change any API. Since it is assumed that repository filtering is
18 something one would do very rarely, and in particular that it's a
19 one-shot operation, this should not be a problem in practice for anyone.
20 However, if you want to re-use a program you have written that uses
21 git-filter-repo as a library (or makes use of one of its --*-callback
22 arguments), you should either make sure you are using the same version of
23 git and git-filter-repo, or make sure to re-test it.
24
25 If there are particular pieces of the API you are concerned about, and
26 there is not already a testcase for it in t9391-lib-usage.sh or
27 t9392-python-callback.sh, please contribute a testcase. That will not
28 prevent me from changing the API, but it will allow you to look at the
29 history of a testcase to see whether and how the API changed.
30 ***** END API BACKWARD COMPATIBILITY CAVEAT *****
31"""
32
33import argparse
34import collections
35import fnmatch
36import gettext
37import io
38import os
39import platform
40import re
41import shutil
42import subprocess
43import sys
44import time
45import textwrap
46
47from datetime import tzinfo, timedelta, datetime
48
__all__ = ["Blob", "Reset", "FileChange", "Commit", "Tag", "Progress",
           "Checkpoint", "FastExportParser", "ProgressWriter",
           "string_to_date", "date_to_string",
           "record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"]

# The globals to make visible to callbacks. They will see all our imports for
# free, as well as our public API.
public_globals = ["__builtins__", "argparse", "collections", "fnmatch",
                  "gettext", "io", "os", "platform", "re", "shutil",
                  "subprocess", "sys", "time", "textwrap", "tzinfo",
                  "timedelta", "datetime"] + __all__

# Forty zeros: the all-zeros object hash (per its name, denotes a deletion)
deleted_hash = b'0'*40
# Whether dump() should emit 'mark' lines for elements (see Tag.dump below)
write_marks = True
# NOTE(review): not referenced anywhere in this chunk; presumably toggles
# lenient parsing of date fields elsewhere in the file -- confirm before use.
date_format_permissive = True
64
def gettext_poison(msg):
  """Translate msg via gettext, unless git's gettext-poison test mode is on.

  In poison mode every translated string becomes a fixed marker, so tests can
  detect strings that were translated when they shouldn't be.
  """
  if "GIT_TEST_GETTEXT_POISON" not in os.environ:
    return gettext.gettext(msg)
  return "# GETTEXT POISON #" # pragma: no cover

# Conventional short alias for marking translatable strings
_ = gettext_poison
71
def setup_gettext():
  """Point gettext at git's message domain and translation directory."""
  TEXTDOMAIN = "git-filter-repo"
  podir = os.environ.get("GIT_TEXTDOMAINDIR") or "@@LOCALEDIR@@"
  if not os.path.isdir(podir): # pragma: no cover
    podir = None # Python has its own fallback; use that

  # git.git:gettext.c and git.git:perl/Git/I18n.pm do the equivalent via
  # locale.setlocale/textdomain/bindtextdomain, but the python docs suggest
  # using the gettext module (which doesn't have setlocale()) instead, so:
  gettext.textdomain(TEXTDOMAIN)
  gettext.bindtextdomain(TEXTDOMAIN, podir)
89
90def _timedelta_to_seconds(delta):
91 """
92 Converts timedelta to seconds
93 """
94 offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000
95 return round(offset)
96
class FixedTimeZone(tzinfo):
  """
  A tzinfo with a constant offset east of UTC, parsed from a git-style
  b'+HHMM'/b'-HHMM' offset string (which is kept verbatim as the zone name).
  """

  tz_re = re.compile(br'^([-+]?)(\d\d)(\d\d)$')

  def __init__(self, offset_string):
    super().__init__()
    sign, hours, minutes = FixedTimeZone.tz_re.match(offset_string).groups()
    total_minutes = 60*int(hours) + int(minutes)
    if sign == b'-':
      total_minutes = -total_minutes
    self._offset = timedelta(minutes=total_minutes)
    self._offset_string = offset_string

  def utcoffset(self, dt):
    return self._offset

  def tzname(self, dt):
    # Deliberately bytes, not str: fed directly into fast-export-style output
    return self._offset_string

  def dst(self, dt):
    return timedelta(0)
119
def string_to_date(datestring):
  """Parse b'<unix timestamp> <tz offset>' into a tz-aware datetime."""
  timestamp, tz_offset = datestring.split()
  return datetime.fromtimestamp(int(timestamp), FixedTimeZone(tz_offset))
124
def date_to_string(dateobj):
  """Render a tz-aware datetime back to b'<unix timestamp> <tz offset>'."""
  epoch = datetime.fromtimestamp(0, dateobj.tzinfo)
  seconds = int(_timedelta_to_seconds(dateobj - epoch))
  return b'%d %s' % (seconds, dateobj.tzinfo.tzname(0))
129
def decode(bytestr):
  """Best-effort decode of bytestr to str, for use in error messages."""
  return bytestr.decode('utf-8', 'backslashreplace')
133
def glob_to_regex(glob_bytestr):
  """
  Translate glob_bytestr into a regex operating on bytestrings.

  Raises SystemExit for globs that cannot round-trip through utf-8, since
  fnmatch.translate() only works on strings.
  """

  # fnmatch.translate is idiotic and won't accept bytestrings
  if (decode(glob_bytestr).encode() != glob_bytestr): # pragma: no cover
    # BUGFIX: this used to call .format() on a %-style format string ("...%s"),
    # so the offending glob was never substituted into the error message.
    raise SystemExit(_("Error: Cannot handle glob %s") % decode(glob_bytestr))

  # Create regex operating on string
  regex = fnmatch.translate(decode(glob_bytestr))

  # FIXME: This is an ugly hack...
  # fnmatch.translate tries to do multi-line matching and wants the glob to
  # match up to the end of the input, which isn't relevant for us, so we
  # have to modify the regex. fnmatch.translate has used different regex
  # constructs to achieve this with different python versions, so we have
  # to check for each of them and then fix it up. It would be much better
  # if fnmatch.translate could just take some flags to allow us to specify
  # what we want rather than employing this hackery, but since it
  # doesn't...
  if regex.endswith(r'\Z(?ms)'): # pragma: no cover
    regex = regex[0:-7]
  elif regex.startswith(r'(?s:') and regex.endswith(r')\Z'): # pragma: no cover
    regex = regex[4:-3]
  elif regex.startswith(r'(?s:') and regex.endswith(r')\z'): # pragma: no cover
    # Yaay, python3.14 for senselessly duplicating \Z as \z...
    regex = regex[4:-3]

  # Finally, convert back to regex operating on bytestr
  return regex.encode()
163
class PathQuoting:
  """
  Helpers for the C-style path quoting used in fast-export/fast-import
  streams: double-quoted paths whose special characters are backslash
  escapes and whose bytes >= 127 are three-digit octal escapes.
  """
  # Escape letter -> the raw byte it stands for
  _unescape = {b'a': b'\a',
               b'b': b'\b',
               b'f': b'\f',
               b'n': b'\n',
               b'r': b'\r',
               b't': b'\t',
               b'v': b'\v',
               b'"': b'"',
               b'\\':b'\\'}
  # Matches one escape sequence: backslash + (letter/quote/backslash | 3 octal digits)
  _unescape_re = re.compile(br'\\([a-z"\\]|[0-9]{3})')
  # Byte value -> its escaped representation; bytes 127-255 get octal escapes
  _escape = [bytes([x]) for x in range(127)]+[
             b'\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)]
  # Raw byte -> escape letter (inverse of _unescape), folded into _escape
  _reverse = dict(map(reversed, _unescape.items()))
  for x in _reverse:
    _escape[ord(x)] = b'\\'+_reverse[x]
  # _special_chars[b] is True iff byte b requires a multi-byte escape
  _special_chars = [len(x) > 1 for x in _escape]

  @staticmethod
  def unescape_sequence(orig):
    # re.sub callback for _unescape_re: expand one escape sequence
    seq = orig.group(1)
    return PathQuoting._unescape[seq] if len(seq) == 1 else bytes([int(seq, 8)])

  @staticmethod
  def dequote(quoted_string):
    # A path wrapped in double quotes has its escapes expanded; anything
    # else is already a literal path
    if quoted_string.startswith(b'"'):
      assert quoted_string.endswith(b'"')
      return PathQuoting._unescape_re.sub(PathQuoting.unescape_sequence,
                                          quoted_string[1:-1])
    return quoted_string

  @staticmethod
  def enquote(unquoted_string):
    # Option 1: Quoting when fast-export would:
    # pqsc = PathQuoting._special_chars
    # if any(pqsc[x] for x in set(unquoted_string)):
    # Option 2, perf hack: do minimal amount of quoting required by fast-import
    if unquoted_string.startswith(b'"') or b'\n' in unquoted_string:
      pqe = PathQuoting._escape
      return b'"' + b''.join(pqe[x] for x in unquoted_string) + b'"'
    return unquoted_string
205
class AncestryGraph(object):
  """
  Maintains a directed acyclic graph of commits, used to answer whether one
  commit is an ancestor of another.

  A note about identifiers:
    * Commit objects carry two ids -- commit.old_id (from the original
      fast-export stream, often an integer but sometimes a hash) and
      commit.id (for the new fast-import stream, which may differ when new
      blobs/commits are inserted). A given AncestryGraph is keyed on one or
      the other, never both; those keys are the keys of self.value.
    * Internally each external id is mapped to a small integer (self.value),
      and the graph structure is stored over those integers (self.graph),
      since repeating full hashes in every child list felt wasteful.
    * For graphs keyed on commit.old_id we can also record each commit's git
      hash (self.git_hash); for graphs keyed on commit.id we cannot know the
      hash until fast-import creates the commit and reports it.
  """

  def __init__(self):
    # Next internal integer id to hand out; incremented per recorded commit
    self.cur_value = 0

    # external id (commit.old_id or commit.id) -> internal integer id
    self.value = {}

    # internal id -> (depth, list-of-parent-internal-ids); a commit's depth
    # is one more than the max depth of its parents
    self.graph = {}

    # external id -> git commit hash; only populated for graphs keyed on
    # commit.old_id (see class docstring)
    self.git_hash = {}

    # Reverse maps; populated lazily via _ensure_reverse_maps_populated()
    self._reverse_value = {}
    self._hash_to_id = {}

    # Memoized results of previous is_ancestor() queries
    self._cached_is_ancestor = {}

  def record_external_commits(self, external_commits):
    """
    Record that each commit in external_commits exists, treating each one
    as a root commit with no parents.
    """
    for commit in external_commits:
      if commit in self.value:
        continue
      self.cur_value += 1
      self.value[commit] = self.cur_value
      self.graph[self.cur_value] = (1, [])
      self.git_hash[commit] = commit

  def add_commit_and_parents(self, commit, parents, githash = None):
    """
    Record that commit has the given parents (all identified by fast-export
    stream ids, usually integers but sometimes hashes). All parents _MUST_
    already be recorded; commit _MUST_ not be. Also records the mapping from
    commit to githash, when githash is given.
    """
    assert all(p in self.value for p in parents)
    assert commit not in self.value

    self.cur_value += 1
    self.value[commit] = self.cur_value
    if githash:
      self.git_hash[commit] = githash

    # Depth is one more than the deepest parent (1 for root commits)
    parent_values = [self.value[p] for p in parents]
    depth = 1 + max((self.graph[p][0] for p in parent_values), default=0)
    self.graph[self.cur_value] = (depth, parent_values)

  def record_hash(self, commit_id, githash):
    '''
    If a githash was not recorded for commit_id when add_commit_and_parents
    was called, add it now.
    '''
    assert commit_id in self.value
    assert commit_id not in self.git_hash
    self.git_hash[commit_id] = githash

  def _ensure_reverse_maps_populated(self):
    # Build both reverse maps together, only on first use
    if self._hash_to_id:
      return
    assert not self._reverse_value
    self._hash_to_id = {h: e for e, h in self.git_hash.items()}
    self._reverse_value = {i: e for e, i in self.value.items()}

  def get_parent_hashes(self, commit_hash):
    '''
    Given a commit hash, return the hashes of its parents.
    '''
    # Chain of lookups:
    #   commit hash -> fast-export id -> internal graph id
    #   -> parent graph ids -> parent fast-export ids -> parent hashes
    self._ensure_reverse_maps_populated()
    export_id = self._hash_to_id[commit_hash]
    graph_id = self.value[export_id]
    parent_export_ids = [self._reverse_value[g]
                         for g in self.graph[graph_id][1]]
    return [self.git_hash[e] for e in parent_export_ids]

  def map_to_hash(self, commit_id):
    '''
    Given a commit's fast-export stream id, return its hash (or None).
    '''
    return self.git_hash.get(commit_id, None)

  def is_ancestor(self, possible_ancestor, check):
    """
    Return whether possible_ancestor is an ancestor of check
    """
    target, start = self.value[possible_ancestor], self.value[check]
    query = (target, start)
    target_depth = self.graph[target][0]
    to_visit = [start]
    seen = set()
    while to_visit:
      node = to_visit.pop()
      pair = (target, node)
      if pair in self._cached_is_ancestor:
        if not self._cached_is_ancestor[pair]:
          continue
        self._cached_is_ancestor[query] = True
        return True
      if node in seen:
        continue
      seen.add(node)
      depth, parents = self.graph[node]
      if node == target:
        self._cached_is_ancestor[query] = True
        return True
      if depth <= target_depth:
        # Everything above node is strictly shallower than node, and target
        # is at least this deep, so target cannot be an ancestor here; prune.
        continue
      to_visit.extend(parents)
    self._cached_is_ancestor[query] = False
    return False
368
class MailmapInfo(object):
  """
  Parsed representation of a git mailmap file: maps (name, email) pairs as
  found in commits to the canonical (name, email) that should replace them.
  """
  def __init__(self, filename):
    # (commit_name or None, commit_email or None) -> (proper_name, proper_email)
    self.changes = {}
    self._parse_file(filename)

  def _parse_file(self, filename):
    """
    Parse filename as a mailmap file, populating self.changes.
    Raises SystemExit for unreadable files and unparseable lines.
    """
    name_and_email_re = re.compile(br'(.*?)\s*<([^>]*)>\s*')
    comment_re = re.compile(br'\s*#.*')
    if not os.access(filename, os.R_OK):
      raise SystemExit(_("Cannot read %s") % decode(filename))
    with open(filename, 'br') as f:
      count = 0
      for line in f:
        count += 1
        err = "Unparseable mailmap file: line #{} is bad: {}".format(count, line)
        # Remove comments
        line = comment_re.sub(b'', line)
        # Remove leading and trailing whitespace
        line = line.strip()
        if not line:
          continue

        # First name/email pair on the line is the canonical ("proper") one
        m = name_and_email_re.match(line)
        if not m:
          raise SystemExit(err)
        proper_name, proper_email = m.groups()
        if len(line) == m.end():
          # Only one pair given: match on email alone (key name is None)
          self.changes[(None, proper_email)] = (proper_name, proper_email)
          continue
        # Second pair (or bare name) is what appears in the commits
        rest = line[m.end():]
        m = name_and_email_re.match(rest)
        if m:
          commit_name, commit_email = m.groups()
          if len(rest) != m.end():
            raise SystemExit(err)
        else:
          commit_name, commit_email = rest, None
        self.changes[(commit_name, commit_email)] = (proper_name, proper_email)

  def translate(self, name, email):
    ''' Given a name and email, return the expected new name and email from the
        mailmap if there is a translation rule for it, otherwise just return
        the given name and email.'''
    # Rules are checked in file order; a rule matches when its email agrees
    # (or it has no email restriction) and its name agrees (or it has no
    # name restriction)
    for old, new in self.changes.items():
      old_name, old_email = old
      new_name, new_email = new
      if (old_email is None or email.lower() == old_email.lower()) and (
          name == old_name or not old_name):
        return (new_name or name, new_email or email)
    return (name, email)
419
class ProgressWriter(object):
  """
  Single-line progress display: each message overwrites the previous one on
  the same terminal line, throttled to roughly ten updates per second.
  """

  def __init__(self):
    self._last_progress_update = time.time()
    self._last_message = None

  def show(self, msg):
    """Remember msg and display it, unless we displayed too recently."""
    self._last_message = msg
    now = time.time()
    if now - self._last_progress_update <= .1:
      return
    self._last_progress_update = now
    sys.stdout.write("\r{}".format(msg))
    sys.stdout.flush()

  def finish(self):
    """Force the final message out regardless of throttling; end the line."""
    self._last_progress_update = 0
    if self._last_message:
      self.show(self._last_message)
    sys.stdout.write("\n")
438
439class _IDs(object):
440 """
441 A class that maintains the 'name domain' of all the 'marks' (short int
442 id for a blob/commit git object). There are two reasons this mechanism
443 is necessary:
444 (1) the output text of fast-export may refer to an object using a different
445 mark than the mark that was assigned to that object using IDS.new().
446 (This class allows you to translate the fast-export marks, "old" to
447 the marks assigned from IDS.new(), "new").
448 (2) when we prune a commit, its "old" id becomes invalid. Any commits
449 which had that commit as a parent needs to use the nearest unpruned
450 ancestor as its parent instead.
451
452 Note that for purpose (1) above, this typically comes about because the user
453 manually creates Blob or Commit objects (for insertion into the stream).
454 It could also come about if we attempt to read the data from two different
455 repositories and trying to combine the data (git fast-export will number ids
456 from 1...n, and having two 1's, two 2's, two 3's, causes issues; granted, we
457 this scheme doesn't handle the two streams perfectly either, but if the first
458 fast export stream is entirely processed and handled before the second stream
459 is started, this mechanism may be sufficient to handle it).
460 """
461
462 def __init__(self):
463 """
464 Init
465 """
466 # The id for the next created blob/commit object
467 self._next_id = 1
468
469 # A map of old-ids to new-ids (1:1 map)
470 self._translation = {}
471
472 # A map of new-ids to every old-id that points to the new-id (1:N map)
473 self._reverse_translation = {}
474
475 def has_renames(self):
476 """
477 Return whether there have been ids remapped to new values
478 """
479 return bool(self._translation)
480
481 def new(self):
482 """
483 Should be called whenever a new blob or commit object is created. The
484 returned value should be used as the id/mark for that object.
485 """
486 rv = self._next_id
487 self._next_id += 1
488 return rv
489
490 def record_rename(self, old_id, new_id, handle_transitivity = False):
491 """
492 Record that old_id is being renamed to new_id.
493 """
494 if old_id != new_id or old_id in self._translation:
495 # old_id -> new_id
496 self._translation[old_id] = new_id
497
498 # Transitivity will be needed if new commits are being inserted mid-way
499 # through a branch.
500 if handle_transitivity:
501 # Anything that points to old_id should point to new_id
502 if old_id in self._reverse_translation:
503 for id_ in self._reverse_translation[old_id]:
504 self._translation[id_] = new_id
505
506 # Record that new_id is pointed to by old_id
507 if new_id not in self._reverse_translation:
508 self._reverse_translation[new_id] = []
509 self._reverse_translation[new_id].append(old_id)
510
511 def translate(self, old_id):
512 """
513 If old_id has been mapped to an alternate id, return the alternate id.
514 """
515 if old_id in self._translation:
516 return self._translation[old_id]
517 else:
518 return old_id
519
520 def __str__(self):
521 """
522 Convert IDs to string; used for debugging
523 """
524 rv = "Current count: %d\nTranslation:\n" % self._next_id
525 for k in sorted(self._translation):
526 rv += " %d -> %s\n" % (k, self._translation[k])
527
528 rv += "Reverse translation:\n"
529 reverse_keys = list(self._reverse_translation.keys())
530 if None in reverse_keys: # pragma: no cover
531 reverse_keys.remove(None)
532 reverse_keys = sorted(reverse_keys)
533 reverse_keys.append(None)
534 for k in reverse_keys:
535 rv += " " + str(k) + " -> " + str(self._reverse_translation[k]) + "\n"
536
537 return rv
538
539class _GitElement(object):
540 """
541 The base class for all git elements that we create.
542 """
543
544 def __init__(self):
545 # A string that describes what type of Git element this is
546 self.type = None
547
548 # A flag telling us if this Git element has been dumped
549 # (i.e. printed) or skipped. Typically elements that have been
550 # dumped or skipped will not be dumped again.
551 self.dumped = 0
552
553 def dump(self, file_):
554 """
555 This version should never be called. Derived classes need to
556 override! We should note that subclasses should implement this
557 method such that the output would match the format produced by
558 fast-export.
559 """
560 raise SystemExit(_("Unimplemented function: %s") % type(self).__name__
561 +".dump()") # pragma: no cover
562
563 def __bytes__(self):
564 """
565 Convert GitElement to bytestring; used for debugging
566 """
567 old_dumped = self.dumped
568 writeme = io.BytesIO()
569 self.dump(writeme)
570 output_lines = writeme.getvalue().splitlines()
571 writeme.close()
572 self.dumped = old_dumped
573 return b"%s:\n %s" % (type(self).__name__.encode(),
574 b"\n ".join(output_lines))
575
576 def skip(self, new_id=None):
577 """
578 Ensures this element will not be written to output
579 """
580 self.dumped = 2
581
class _GitElementWithId(_GitElement):
  """
  The base class for Git elements that have IDs (commits and blobs)
  """

  def __init__(self):
    super().__init__()

    # The mark (short, portable id) for this element
    self.id = _IDS.new()

    # The previous mark for this element
    self.old_id = None

  def skip(self, new_id=None):
    """
    Stop this element from being automatically written to output, and
    redirect references to its id to new_id (important for skipped commits,
    whose id must be translated to that of a surviving parent).
    """
    self.dumped = 2

    _IDS.record_rename(self.old_id or self.id, new_id)
605
class Blob(_GitElementWithId):
  """
  This class defines our representation of git blob elements, i.e. our way
  of representing file contents.
  """

  def __init__(self, data, original_id = None):
    super().__init__()

    # Denote that this is a blob
    self.type = 'blob'

    # Hash this blob had in the original repository, if known
    self.original_id = original_id

    # The raw file contents
    assert(type(data) == bytes)
    self.data = data

  def dump(self, file_):
    """
    Write this blob element to a file, in fast-import format.
    """
    self.dumped = 1
    # Keep the hash <-> new-mark correspondence for later lookups
    BLOB_HASH_TO_NEW_ID[self.original_id] = self.id
    BLOB_NEW_ID_TO_HASH[self.id] = self.original_id

    file_.write(b'blob\nmark :%d\ndata %d\n%s\n' %
                (self.id, len(self.data), self.data))
637
638
class Reset(_GitElement):
  """
  This class defines our representation of git reset elements: the creation
  (or recreation) of a named branch, optionally starting from a specific
  revision.
  """

  def __init__(self, ref, from_ref = None):
    _GitElement.__init__(self)

    self.type = 'reset'       # element kind tag
    self.ref = ref            # name of the branch being (re)created
    self.from_ref = from_ref  # branch/commit we are resetting from, if any

  def dump(self, file_):
    """
    Write this reset element to a file, in fast-import format.
    """
    self.dumped = 1

    file_.write(b'reset %s\n' % self.ref)
    if self.from_ref:
      # Integer from_refs are marks; anything else is written literally
      fmt = b'from :%d\n' if isinstance(self.from_ref, int) else b'from %s\n'
      file_.write(fmt % self.from_ref)
    file_.write(b'\n')
671
class FileChange(_GitElement):
  """
  This class defines our representation of file change elements. File change
  elements are components within a Commit element.
  """

  def __init__(self, type_, filename = None, id_ = None, mode = None):
    _GitElement.__init__(self)

    # Kind of file-change: b'M' (modify), b'D' (delete), b'R' (rename), or
    # b'DELETEALL'. (We could assert(type(type_) == bytes) here, but don't,
    # purely due to worries about performance overhead...)
    self.type = type_

    # Name of the file being changed
    self.filename = filename

    # Mode of the file entry (non-executable, executable, or symlink)
    self.mode = mode

    # The mark of the blob this change refers to
    self.blob_id = id_

    if type_ == b'DELETEALL':
      assert filename is None and id_ is None and mode is None
      self.filename = b'' # Just so PathQuoting.enquote doesn't die
      return

    assert filename is not None
    if type_ == b'M':
      assert id_ is not None and mode is not None
    elif type_ == b'D':
      assert id_ is None and mode is None
    elif type_ == b'R': # pragma: no cover (now avoid fast-export renames)
      assert mode is None
      if id_ is None:
        raise SystemExit(_("new name needed for rename of %s") % filename)
      # Renames store (old_name, new_name) in filename and have no blob
      self.filename = (self.filename, id_)
      self.blob_id = None

  def dump(self, file_):
    """
    Write this file-change element to a file, in fast-import format.
    """
    if self.type == b'M' and self.blob_id is None:
      # The referenced blob was skipped, so drop this change as well
      return
    self.dumped = 1

    path = PathQuoting.enquote(self.filename)
    if self.type == b'M':
      if isinstance(self.blob_id, int):
        file_.write(b'M %s :%d %s\n' % (self.mode, self.blob_id, path))
      else:
        file_.write(b'M %s %s %s\n' % (self.mode, self.blob_id, path))
    elif self.type == b'D':
      file_.write(b'D %s\n' % path)
    elif self.type == b'DELETEALL':
      file_.write(b'deleteall\n')
    else:
      raise SystemExit(_("Unhandled filechange type: %s") % self.type) # pragma: no cover
733
class Commit(_GitElementWithId):
  """
  This class defines our representation of commit elements. Commit elements
  contain all the information associated with a commit.
  """

  def __init__(self, branch,
               author_name, author_email, author_date,
               committer_name, committer_email, committer_date,
               message,
               file_changes,
               parents,
               original_id = None,
               encoding = None, # encoding for message; None implies UTF-8
               **kwargs): # extra keyword args accepted; not used in this body
    _GitElementWithId.__init__(self)
    # Commits remember the mark they started with, even if renumbered later
    self.old_id = self.id

    # Denote that this is a commit element
    self.type = 'commit'

    # Record the affected branch
    self.branch = branch

    # Record original id
    self.original_id = original_id

    # Record author's name
    self.author_name  = author_name

    # Record author's email
    self.author_email = author_email

    # Record date of authoring
    self.author_date  = author_date

    # Record committer's name
    self.committer_name  = committer_name

    # Record committer's email
    self.committer_email = committer_email

    # Record date the commit was made
    self.committer_date  = committer_date

    # Record commit message and its encoding
    self.encoding = encoding
    self.message = message

    # List of file-changes associated with this commit. Note that file-changes
    # are also represented as git elements
    self.file_changes = file_changes

    self.parents = parents

  def dump(self, file_):
    """
    Write this commit element to a file.
    """
    self.dumped = 1

    # Make output to fast-import slightly easier for humans to read if the
    # message has no trailing newline of its own; cosmetic, but a nice touch...
    extra_newline = b'\n'
    if self.message.endswith(b'\n') or not (self.parents or self.file_changes):
      extra_newline = b''

    if not self.parents:
      # Root commits get a reset first so they don't build on the branch's
      # previous tip
      file_.write(b'reset %s\n' % self.branch)
    file_.write((b'commit %s\n'
                 b'mark :%d\n'
                 b'author %s <%s> %s\n'
                 b'committer %s <%s> %s\n'
                ) % (
                  self.branch, self.id,
                  self.author_name, self.author_email, self.author_date,
                  self.committer_name, self.committer_email, self.committer_date
                ))
    if self.encoding:
      file_.write(b'encoding %s\n' % self.encoding)
    file_.write(b'data %d\n%s%s' %
                (len(self.message), self.message, extra_newline))
    # First parent uses 'from'; any additional (merge) parents use 'merge'
    for i, parent in enumerate(self.parents):
      file_.write(b'from ' if i==0 else b'merge ')
      if isinstance(parent, int):
        file_.write(b':%d\n' % parent)
      else:
        file_.write(b'%s\n' % parent)
    for change in self.file_changes:
      change.dump(file_)
    if not self.parents and not self.file_changes:
      # Workaround a bug in pre-git-2.22 versions of fast-import with
      # the get-mark directive.
      file_.write(b'\n')
    file_.write(b'\n')

  def first_parent(self):
    """
    Return first parent commit
    """
    if self.parents:
      return self.parents[0]
    return None

  def skip(self, new_id=None):
    # Remember that this commit was pruned, then remap its id as usual
    _SKIPPED_COMMITS.add(self.old_id or self.id)
    _GitElementWithId.skip(self, new_id)
841
class Tag(_GitElementWithId):
  """
  This class defines our representation of annotated tag elements.
  """

  def __init__(self, ref, from_ref,
               tagger_name, tagger_email, tagger_date, tag_msg,
               original_id = None):
    super().__init__()
    # Tags remember the mark they started with, even if renumbered later
    self.old_id = self.id

    self.type = 'tag'                # element kind tag
    self.ref = ref                   # name of the tag
    self.from_ref = from_ref         # entity being tagged (should be a commit)
    self.original_id = original_id   # hash of this tag in the original repo
    self.tagger_name = tagger_name   # name of the tagger
    self.tagger_email = tagger_email # email of the tagger
    self.tagger_date = tagger_date   # date of tagging
    self.message = tag_msg           # the tag message

  def dump(self, file_):
    """
    Write this tag element to a file, in fast-import format.
    """

    self.dumped = 1

    file_.write(b'tag %s\n' % self.ref)
    if write_marks and self.id:
      file_.write(b'mark :%d\n' % self.id)
    if isinstance(self.from_ref, int):
      file_.write(b'from :%d\n' % self.from_ref)
    else:
      file_.write(b'from %s\n' % self.from_ref)
    if self.tagger_name:
      file_.write(b'tagger %s <%s> ' % (self.tagger_name, self.tagger_email))
      file_.write(self.tagger_date)
      file_.write(b'\n')
    file_.write(b'data %d\n%s' % (len(self.message), self.message))
    file_.write(b'\n')
895
class Progress(_GitElement):
  """
  This class defines our representation of progress elements: a message
  that fast-import prints when it processes the progress command.
  """

  def __init__(self, message):
    _GitElement.__init__(self)

    self.type = 'progress'  # element kind tag
    self.message = message  # the progress message to emit

  def dump(self, file_):
    """
    Write this progress element to a file, in fast-import format.
    """
    self.dumped = 1

    file_.write(b'progress %s\n\n' % self.message)
920
class Checkpoint(_GitElement):
  """
  This class defines our representation of checkpoint elements: events
  which force fast-import to close the current packfile, start a new one,
  and save out all current branch refs, tags and marks.
  """

  def __init__(self):
    _GitElement.__init__(self)

    self.type = 'checkpoint'  # element kind tag

  def dump(self, file_):
    """
    Write this checkpoint element to a file, in fast-import format.
    """
    self.dumped = 1

    file_.write(b'checkpoint\n\n')
943
class LiteralCommand(_GitElement):
  """
  This class defines our representation of raw commands: a single line that
  is passed through without any special processing.
  """

  def __init__(self, line):
    _GitElement.__init__(self)

    self.type = 'literal'  # element kind tag
    self.line = line       # the raw line, written out verbatim

  def dump(self, file_):
    """
    Write the stored line to a file, unchanged.
    """
    self.dumped = 1

    file_.write(self.line)
966
class Alias(_GitElement):
  """
  Represents a fast-import 'alias' element: the setting of one mark to
  the same sha1sum as another, usually because the newer mark
  corresponded to a pruned commit.
  """

  def __init__(self, ref, to_ref):
    _GitElement.__init__(self)
    # Tag this object so dispatching code can identify it
    self.type = 'alias'

    # Marks involved: ref is aliased to to_ref
    self.ref = ref
    self.to_ref = to_ref

  def dump(self, file_):
    """
    Serialize this alias element to file_ in fast-import format
    """
    self.dumped = 1
    file_.write(b'alias\nmark :%d\nto :%d\n\n' % (self.ref, self.to_ref))
989
class FastExportParser(object):
  """
  A class for parsing and handling the output from fast-export. This
  class allows the user to register callbacks when various types of
  data are encountered in the fast-export output. The basic idea is that,
  FastExportParser takes fast-export output, creates the various objects
  as it encounters them, the user gets to use/modify these objects via
  callbacks, and finally FastExportParser outputs the modified objects
  in fast-import format (presumably so they can be used to create a new
  repo).
  """

  def __init__(self,
               tag_callback = None, commit_callback = None,
               blob_callback = None, progress_callback = None,
               reset_callback = None, checkpoint_callback = None,
               done_callback = None):
    # Members below simply store callback functions for the various git
    # elements
    self._tag_callback = tag_callback
    self._blob_callback = blob_callback
    self._reset_callback = reset_callback
    self._commit_callback = commit_callback
    self._progress_callback = progress_callback
    self._checkpoint_callback = checkpoint_callback
    self._done_callback = done_callback

    # Keep track of which refs appear from the export, and which make it to
    # the import (pruning of empty commits, renaming of refs, and creating
    # new manual objects and inserting them can cause these to differ).
    self._exported_refs = set()
    self._imported_refs = set()

    # A list of the branches we've seen, plus the last known commit they
    # pointed to. An entry in latest_*commit will be deleted if we get a
    # reset for that branch. These are used because of fast-import's weird
    # decision to allow having an implicit parent via naming the branch
    # instead of requiring branches to be specified via 'from' directives.
    self._latest_commit = {}
    self._latest_orig_commit = {}

    # A handle to the input source for the fast-export data
    self._input = None

    # A handle to the output file for the output we generate (we call dump
    # on many of the git elements we create).
    self._output = None

    # Stores the contents of the current line of input being parsed
    self._currentline = ''

    # Tracks LFS objects we have found
    self._lfs_object_tracker = None

    # Compile some regexes and cache those
    self._mark_re = re.compile(br'mark :(\d+)\n$')
    self._parent_regexes = {}
    # Parents appear either as a mark (':<num>') or as a full 40-char sha1
    parent_regex_rules = (br' :(\d+)\n$', br' ([0-9a-f]{40})\n')
    for parent_refname in (b'from', b'merge'):
      ans = [re.compile(parent_refname+x) for x in parent_regex_rules]
      self._parent_regexes[parent_refname] = ans
    self._quoted_string_re = re.compile(br'"(?:[^"\\]|\\.)*"')
    self._refline_regexes = {}
    for refline_name in (b'reset', b'commit', b'tag', b'progress'):
      self._refline_regexes[refline_name] = re.compile(refline_name+b' (.*)\n$')
    self._user_regexes = {}
    for user in (b'author', b'committer', b'tagger'):
      self._user_regexes[user] = re.compile(user + b' (.*?) <(.*?)> (.*)\n$')

  def _advance_currentline(self):
    """
    Grab the next line of input
    """
    self._currentline = self._input.readline()

  def _parse_optional_mark(self):
    """
    If the current line contains a mark, parse it and advance to the
    next line; return None otherwise
    """
    mark = None
    matches = self._mark_re.match(self._currentline)
    if matches:
      mark = int(matches.group(1))
      self._advance_currentline()
    return mark

  def _parse_optional_parent_ref(self, refname):
    """
    If the current line contains a reference to a parent commit, then
    parse it and advance the current line; otherwise return None. Note
    that the name of the reference ('from', 'merge') must match the
    refname arg.
    """
    orig_baseref, baseref = None, None
    rule, altrule = self._parent_regexes[refname]
    matches = rule.match(self._currentline)
    if matches:
      orig_baseref = int(matches.group(1))
      # We translate the parent commit mark to what it needs to be in
      # our mark namespace
      baseref = _IDS.translate(orig_baseref)
      self._advance_currentline()
    else:
      # Not a mark; try the raw 40-char sha1 form, which needs no
      # translation
      matches = altrule.match(self._currentline)
      if matches:
        orig_baseref = matches.group(1)
        baseref = orig_baseref
        self._advance_currentline()
    return orig_baseref, baseref

  def _parse_optional_filechange(self):
    """
    If the current line contains a file-change object, then parse it
    and advance the current line; otherwise return None. We only care
    about file changes of type b'M' and b'D' (these are the only types
    of file-changes that fast-export will provide).
    """
    filechange = None
    changetype = self._currentline[0:1]
    if changetype == b'M':
      (changetype, mode, idnum, path) = self._currentline.split(None, 3)
      if idnum[0:1] == b':':
        idnum = idnum[1:]
      path = path.rstrip(b'\n')
      # Check for LFS objects from sources before we might toss this filechange
      # (mode 160000 is a submodule commit reference, not a blob)
      if mode != b'160000' and self._lfs_object_tracker:
        value = int(idnum) if len(idnum) != 40 else idnum
        self._lfs_object_tracker.check_file_change_data(value, True)
      # We translate the idnum to our id system
      if len(idnum) != 40:
        idnum = _IDS.translate( int(idnum) )
      if idnum is not None:
        if path.startswith(b'"'):
          path = PathQuoting.dequote(path)
        filechange = FileChange(b'M', path, idnum, mode)
      else:
        # The mark translated to nothing (e.g. referenced object was
        # pruned); signal the caller to drop this filechange
        filechange = b'skipped'
      self._advance_currentline()
    elif changetype == b'D':
      (changetype, path) = self._currentline.split(None, 1)
      path = path.rstrip(b'\n')
      if path.startswith(b'"'):
        path = PathQuoting.dequote(path)
      filechange = FileChange(b'D', path)
      self._advance_currentline()
    elif changetype == b'R': # pragma: no cover (now avoid fast-export renames)
      rest = self._currentline[2:-1]
      if rest.startswith(b'"'):
        m = self._quoted_string_re.match(rest)
        if not m:
          raise SystemExit(_("Couldn't parse rename source"))
        orig = PathQuoting.dequote(m.group(0))
        new = rest[m.end()+1:]
      else:
        orig, new = rest.split(b' ', 1)
      if new.startswith(b'"'):
        new = PathQuoting.dequote(new)
      filechange = FileChange(b'R', orig, new)
      self._advance_currentline()
    return filechange

  def _parse_original_id(self):
    """
    Parse the value from an 'original-oid' line and advance current-line.
    """
    original_id = self._currentline[len(b'original-oid '):].rstrip()
    self._advance_currentline()
    return original_id

  def _parse_encoding(self):
    """
    Parse the value from an 'encoding' line and advance current-line.
    """
    encoding = self._currentline[len(b'encoding '):].rstrip()
    self._advance_currentline()
    return encoding

  def _parse_ref_line(self, refname):
    """
    Parses string data (often a branch name) from current-line. The name of
    the string data must match the refname arg. The program will crash if
    current-line does not match, so current-line will always be advanced if
    this method returns.
    """
    matches = self._refline_regexes[refname].match(self._currentline)
    if not matches:
      raise SystemExit(_("Malformed %(refname)s line: '%(line)s'") %
                       ({'refname': refname, 'line':self._currentline})
                       ) # pragma: no cover
    ref = matches.group(1)
    self._advance_currentline()
    return ref

  def _parse_user(self, usertype):
    """
    Get user name, email, datestamp from current-line. Current-line will
    be advanced.
    """
    user_regex = self._user_regexes[usertype]
    (name, email, when) = user_regex.match(self._currentline).groups()

    self._advance_currentline()
    return (name, email, when)

  def _parse_data(self):
    """
    Reads data from _input. Current-line will be advanced until it is beyond
    the data.
    """
    fields = self._currentline.split()
    assert fields[0] == b'data'
    size = int(fields[1])
    # Read exactly the advertised number of bytes; the payload is not
    # line-oriented so we bypass readline here
    data = self._input.read(size)
    self._advance_currentline()
    if self._currentline == b'\n':
      self._advance_currentline()
    return data

  def _parse_blob(self):
    """
    Parse input data into a Blob object. Once the Blob has been created, it
    will be handed off to the appropriate callbacks. Current-line will be
    advanced until it is beyond this blob's data. The Blob will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback).
    """
    # Parse the Blob
    self._advance_currentline()
    id_ = self._parse_optional_mark()

    original_id = None
    if self._currentline.startswith(b'original-oid'):
      original_id = self._parse_original_id();

    data = self._parse_data()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the blob
    blob = Blob(data, original_id)

    # If fast-export text had a mark for this blob, need to make sure this
    # mark translates to the blob's true id.
    if id_:
      blob.old_id = id_
      _IDS.record_rename(id_, blob.id)

    # Check for LFS objects
    if self._lfs_object_tracker:
      self._lfs_object_tracker.check_blob_data(data, blob.old_id, True)

    # Call any user callback to allow them to use/modify the blob
    if self._blob_callback:
      self._blob_callback(blob)

    # Now print the resulting blob
    if not blob.dumped:
      blob.dump(self._output)

  def _parse_reset(self):
    """
    Parse input data into a Reset object. Once the Reset has been created,
    it will be handed off to the appropriate callbacks. Current-line will
    be advanced until it is beyond the reset data. The Reset will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback).
    """
    # Parse the Reset
    ref = self._parse_ref_line(b'reset')
    self._exported_refs.add(ref)
    ignoreme, from_ref = self._parse_optional_parent_ref(b'from')
    if self._currentline == b'\n':
      self._advance_currentline()

    # fast-export likes to print extraneous resets that serve no purpose.
    # While we could continue processing such resets, that is a waste of
    # resources. Also, we want to avoid recording that this ref was
    # seen in such cases, since this ref could be rewritten to nothing.
    if not from_ref:
      self._latest_commit.pop(ref, None)
      self._latest_orig_commit.pop(ref, None)
      return

    # Create the reset
    reset = Reset(ref, from_ref)

    # Call any user callback to allow them to modify the reset
    if self._reset_callback:
      self._reset_callback(reset)

    # Update metadata
    self._latest_commit[reset.ref] = reset.from_ref
    self._latest_orig_commit[reset.ref] = reset.from_ref

    # Now print the resulting reset
    if not reset.dumped:
      self._imported_refs.add(reset.ref)
      reset.dump(self._output)

  def _parse_commit(self):
    """
    Parse input data into a Commit object. Once the Commit has been created,
    it will be handed off to the appropriate callbacks. Current-line will
    be advanced until it is beyond the commit data. The Commit will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback OR the callback has removed all file-changes from the commit).
    """
    # Parse the Commit. This may look involved, but it's pretty simple; it only
    # looks bad because a commit object contains many pieces of data.
    branch = self._parse_ref_line(b'commit')
    self._exported_refs.add(branch)
    id_ = self._parse_optional_mark()

    original_id = None
    if self._currentline.startswith(b'original-oid'):
      original_id = self._parse_original_id();

    author_name = None
    author_email = None
    if self._currentline.startswith(b'author'):
      (author_name, author_email, author_date) = self._parse_user(b'author')

    (committer_name, committer_email, committer_date) = \
      self._parse_user(b'committer')

    # The author line is optional in fast-export streams; fall back to the
    # committer identity (and date) when it was absent or empty
    if not author_name and not author_email:
      (author_name, author_email, author_date) = \
        (committer_name, committer_email, committer_date)

    encoding = None
    if self._currentline.startswith(b'encoding '):
      encoding = self._parse_encoding()

    commit_msg = self._parse_data()

    pinfo = [self._parse_optional_parent_ref(b'from')]
    # Due to empty pruning, we can have real 'from' and 'merge' lines that
    # due to commit rewriting map to a parent of None. We need to record
    # 'from' if its non-None, and we need to parse all 'merge' lines.
    while self._currentline.startswith(b'merge '):
      pinfo.append(self._parse_optional_parent_ref(b'merge'))
    orig_parents, parents = [list(tmp) for tmp in zip(*pinfo)]

    # No parents is oddly represented as [None] instead of [], due to the
    # special 'from' handling. Convert it here to a more canonical form.
    if parents == [None]:
      parents = []
    if orig_parents == [None]:
      orig_parents = []

    # fast-import format is kinda stupid in that it allows implicit parents
    # based on the branch name instead of requiring them to be specified by
    # 'from' directives. The only way to get no parent is by using a reset
    # directive first, which clears the latest_commit_for_this_branch tracking.
    if not orig_parents and self._latest_commit.get(branch):
      parents = [self._latest_commit[branch]]
    if not orig_parents and self._latest_orig_commit.get(branch):
      orig_parents = [self._latest_orig_commit[branch]]

    # Get the list of file changes
    file_changes = []
    file_change = self._parse_optional_filechange()
    had_file_changes = file_change is not None
    while file_change:
      # b'skipped' is a sentinel for a filechange whose mark translated to
      # nothing (pruned object); drop it rather than recording it
      if not (type(file_change) == bytes and file_change == b'skipped'):
        file_changes.append(file_change)
      file_change = self._parse_optional_filechange()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Okay, now we can finally create the Commit object
    commit = Commit(branch,
                    author_name, author_email, author_date,
                    committer_name, committer_email, committer_date,
                    commit_msg, file_changes, parents, original_id, encoding)

    # If fast-export text had a mark for this commit, need to make sure this
    # mark translates to the commit's true id.
    if id_:
      commit.old_id = id_
      _IDS.record_rename(id_, commit.id)

    # refs/notes/ put commit-message-related material in blobs, and name their
    # files according to the hash of other commits. That totally messes with
    # all normal callbacks; fast-export should really export these as different
    # kinds of objects. Until then, let's just pass these commits through as-is
    # and hope the blob callbacks don't mess things up.
    if commit.branch.startswith(b'refs/notes/'):
      self._imported_refs.add(commit.branch)
      commit.dump(self._output)
      return

    # Call any user callback to allow them to modify the commit
    aux_info = {'orig_parents': orig_parents,
                'had_file_changes': had_file_changes}
    if self._commit_callback:
      self._commit_callback(commit, aux_info)

    # Now print the resulting commit, or if prunable skip it
    self._latest_orig_commit[branch] = commit.id
    # NOTE: 'not X in Y' parses as 'not (X in Y)'; this records the commit
    # as the branch tip only when it was not skipped
    if not (commit.old_id or commit.id) in _SKIPPED_COMMITS:
      self._latest_commit[branch] = commit.id
    if not commit.dumped:
      self._imported_refs.add(commit.branch)
      commit.dump(self._output)

  def _parse_tag(self):
    """
    Parse input data into a Tag object. Once the Tag has been created,
    it will be handed off to the appropriate callbacks. Current-line will
    be advanced until it is beyond the tag data. The Tag will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback).
    """
    # Parse the Tag
    tag = self._parse_ref_line(b'tag')
    self._exported_refs.add(b'refs/tags/'+tag)
    id_ = self._parse_optional_mark()
    ignoreme, from_ref = self._parse_optional_parent_ref(b'from')

    original_id = None
    if self._currentline.startswith(b'original-oid'):
      original_id = self._parse_original_id();

    tagger_name, tagger_email, tagger_date = None, None, None
    if self._currentline.startswith(b'tagger'):
      (tagger_name, tagger_email, tagger_date) = self._parse_user(b'tagger')
    tag_msg = self._parse_data()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the tag
    tag = Tag(tag, from_ref,
              tagger_name, tagger_email, tagger_date, tag_msg,
              original_id)

    # If fast-export text had a mark for this tag, need to make sure this
    # mark translates to the tag's true id.
    if id_:
      tag.old_id = id_
      _IDS.record_rename(id_, tag.id)

    # Call any user callback to allow them to modify the tag
    if self._tag_callback:
      self._tag_callback(tag)

    # The tag might not point at anything that still exists (self.from_ref
    # will be None if the commit it pointed to and all its ancestors were
    # pruned due to being empty)
    if tag.from_ref:
      # Print out this tag's information
      if not tag.dumped:
        self._imported_refs.add(b'refs/tags/'+tag.ref)
        tag.dump(self._output)
    else:
      tag.skip()

  def _parse_progress(self):
    """
    Parse input data into a Progress object. Once the Progress has
    been created, it will be handed off to the appropriate
    callbacks. Current-line will be advanced until it is beyond the
    progress data. The Progress will be dumped to _output once
    everything else is done (unless it has been skipped by the callback).
    """
    # Parse the Progress
    message = self._parse_ref_line(b'progress')
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the progress message
    progress = Progress(message)

    # Call any user callback to allow them to modify the progress messsage
    if self._progress_callback:
      self._progress_callback(progress)

    # NOTE: By default, we do NOT print the progress message; git
    # fast-import would write it to fast_import_pipes which could mess with
    # our parsing of output from the 'ls' and 'get-mark' directives we send
    # to fast-import. If users want these messages, they need to process
    # and handle them in the appropriate callback above.

  def _parse_checkpoint(self):
    """
    Parse input data into a Checkpoint object. Once the Checkpoint has
    been created, it will be handed off to the appropriate
    callbacks. Current-line will be advanced until it is beyond the
    checkpoint data. The Checkpoint will be dumped to _output once
    everything else is done (unless it has been skipped by the callback).
    """
    # Parse the Checkpoint
    self._advance_currentline()
    if self._currentline == b'\n':
      self._advance_currentline()

    # Create the checkpoint
    checkpoint = Checkpoint()

    # Call any user callback to allow them to drop the checkpoint
    if self._checkpoint_callback:
      self._checkpoint_callback(checkpoint)

    # NOTE: By default, we do NOT print the checkpoint message; although it
    # we would only realistically get them with --stdin, the fact that we
    # are filtering makes me think the checkpointing is less likely to be
    # reasonable. In fact, I don't think it's necessary in general. If
    # users do want it, they should process it in the checkpoint_callback.

  def _parse_literal_command(self):
    """
    Parse literal command. Then just dump the line as is.
    """
    # Create the literal command object
    command = LiteralCommand(self._currentline)
    self._advance_currentline()

    # Now print the resulting literal command
    if not command.dumped:
      command.dump(self._output)

  def insert(self, obj):
    """
    Dump a manually-created element into the output stream, recording its
    ref (for Commit/Reset/Tag objects) among the imported refs.
    """
    assert not obj.dumped
    obj.dump(self._output)
    if type(obj) == Commit:
      self._imported_refs.add(obj.branch)
    elif type(obj) in (Reset, Tag):
      self._imported_refs.add(obj.ref)

  def run(self, input, output):
    """
    This method filters fast export output.
    """
    # Set input. If no args provided, use stdin.
    self._input = input
    self._output = output

    # Run over the input and do the filtering
    self._advance_currentline()
    while self._currentline:
      if self._currentline.startswith(b'blob'):
        self._parse_blob()
      elif self._currentline.startswith(b'reset'):
        self._parse_reset()
      elif self._currentline.startswith(b'commit'):
        self._parse_commit()
      elif self._currentline.startswith(b'tag'):
        self._parse_tag()
      elif self._currentline.startswith(b'progress'):
        self._parse_progress()
      elif self._currentline.startswith(b'checkpoint'):
        self._parse_checkpoint()
      elif self._currentline.startswith(b'feature'):
        self._parse_literal_command()
      elif self._currentline.startswith(b'option'):
        self._parse_literal_command()
      elif self._currentline.startswith(b'done'):
        if self._done_callback:
          self._done_callback()
        self._parse_literal_command()
        # Prevent confusion from others writing additional stuff that'll just
        # be ignored
        self._output.close()
      elif self._currentline.startswith(b'#'):
        self._parse_literal_command()
      elif self._currentline.startswith(b'get-mark') or \
           self._currentline.startswith(b'cat-blob') or \
           self._currentline.startswith(b'ls'):
        raise SystemExit(_("Unsupported command: '%s'") % self._currentline)
      else:
        raise SystemExit(_("Could not parse line: '%s'") % self._currentline)

  def get_exported_and_imported_refs(self):
    """
    Return the (exported_refs, imported_refs) sets gathered during run().
    """
    return self._exported_refs, self._imported_refs
1559
def record_id_rename(old_id, new_id):
  """
  Register a new translation from old_id to new_id, propagating it
  transitively through any already-recorded renames.
  """
  # Third positional argument enables transitive handling
  _IDS.record_rename(old_id, new_id, True)
1566
# Internal globals
# Global mark-translation table; maps fast-export marks into our own
# mark namespace (see FastExportParser's use of _IDS.translate)
_IDS = _IDs()
# Commits pruned during filtering; consulted when deciding whether a
# commit may be recorded as a branch's latest commit
_SKIPPED_COMMITS = set()
# Map from original blob sha1 to the new id assigned during filtering
BLOB_HASH_TO_NEW_ID = {}
# Reverse mapping; presumably maintained alongside BLOB_HASH_TO_NEW_ID
# by code elsewhere in this file -- not populated in this section
BLOB_NEW_ID_TO_HASH = {}
# User-facing instructions printed after a sensitive-data removal run.
# The [1:] strips the leading newline kept for source readability; the
# three %s placeholders are filled in by the caller.
sdr_next_steps = _("""
NEXT STEPS FOR YOUR SENSITIVE DATA REMOVAL:
  * If you are doing your rewrite in multiple steps, ignore these next steps
    until you have completed all your invocations of git-filter-repo.
  * See the "Sensitive Data Removal" subsection of the "DISCUSSION" section
    of the manual for more details about any of the steps below.
  * Inspect this repository and verify that the sensitive data is indeed
    completely removed from all commits.
  * Force push the rewritten history to the server:
      %s
  * Contact the server admins for additional steps they need to take; the
    First Changed Commit(s)%s may come in handy here.
  * Have other colleagues with a clone either discard their clone and reclone
    OR follow the detailed steps in the manual to repeatedly rebase and
    purge the sensitive data from their copy.  Again, the First Changed
    Commit(s)%s may come in handy.
  * See the "Prevent repeats and avoid future sensitive data spills" section
    of the manual.
"""[1:])
1591
class SubprocessWrapper(object):
  """
  Drop-in stand-in for the subprocess module that decodes bytes command
  arguments (and bytes 'cwd' values) to str before delegating, for
  platforms whose subprocess cannot accept bytes args.
  """

  @staticmethod
  def decodify(args):
    # A plain string needs no conversion
    if type(args) == str:
      return args
    assert type(args) == list
    # Decode only the bytes elements; leave everything else untouched
    return [decode(arg) if type(arg) == bytes else arg for arg in args]

  @staticmethod
  def call(*args, **kwargs):
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.call(SubprocessWrapper.decodify(*args), **kwargs)

  @staticmethod
  def check_output(*args, **kwargs):
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.check_output(SubprocessWrapper.decodify(*args), **kwargs)

  @staticmethod
  def check_call(*args, **kwargs): # pragma: no cover # used by filter-lamely
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.check_call(SubprocessWrapper.decodify(*args), **kwargs)

  @staticmethod
  def Popen(*args, **kwargs):
    if 'cwd' in kwargs:
      kwargs['cwd'] = decode(kwargs['cwd'])
    return subprocess.Popen(SubprocessWrapper.decodify(*args), **kwargs)
1624
# By default dispatch straight to the subprocess module, but on Windows
# (or when PRETEND_UNICODE_ARGS is set -- presumably so tests can exercise
# the wrapper on other platforms; confirm against the test suite) route
# through SubprocessWrapper so bytes args get decoded to str first.
subproc = subprocess
if platform.system() == 'Windows' or 'PRETEND_UNICODE_ARGS' in os.environ:
  subproc = SubprocessWrapper
1628
class GitUtils(object):
  """
  Collection of static helpers that shell out to git to inspect a
  repository (counts, refs, config, blob sizes, diffs).
  """

  @staticmethod
  def get_commit_count(repo, *args):
    """
    Return the number of commits that have been made on repo.
    """
    if not args:
      args = ['--all']
    # Allow callers to pass a single list of rev-list args instead of varargs
    if len(args) == 1 and isinstance(args[0], list):
      args = args[0]
    p = subproc.Popen(["git", "rev-list", "--count"] + args,
                      stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                      cwd=repo)
    if p.wait() != 0:
      raise SystemExit(_("%s does not appear to be a valid git repository")
                       % decode(repo))
    return int(p.stdout.read())

  @staticmethod
  def get_total_objects(repo):
    """
    Return the number of objects (both packed and unpacked)
    """
    p1 = subproc.Popen(["git", "count-objects", "-v"],
                       stdout=subprocess.PIPE, cwd=repo)
    lines = p1.stdout.read().splitlines()
    # Return unpacked objects + packed-objects
    return int(lines[0].split()[1]) + int(lines[2].split()[1])

  @staticmethod
  def is_repository_bare(repo_working_dir):
    """
    Return True if the repository at repo_working_dir is bare.
    """
    out = subproc.check_output('git rev-parse --is-bare-repository'.split(),
                               cwd=repo_working_dir)
    return (out.strip() == b'true')

  @staticmethod
  def determine_git_dir(repo_working_dir):
    """
    Return the path to the .git directory for repo_working_dir, joined
    onto repo_working_dir when git reports a relative path.
    """
    d = subproc.check_output('git rev-parse --git-dir'.split(),
                             cwd=repo_working_dir).strip()
    if repo_working_dir==b'.' or d.startswith(b'/'):
      return d
    return os.path.join(repo_working_dir, d)

  @staticmethod
  def get_refs(repo_working_dir):
    """
    Return a dict mapping refname -> sha1 for all refs in the repository.
    """
    try:
      output = subproc.check_output('git show-ref'.split(),
                                    cwd=repo_working_dir)
    except subprocess.CalledProcessError as e:
      # If error code is 1, there just aren't any refs; i.e. new repo.
      # If error code is other than 1, some other error (e.g. not a git repo)
      if e.returncode != 1:
        raise SystemExit('fatal: {}'.format(e))
      output = ''
    # show-ref prints '<sha> <refname>'; reversed() flips each pair so the
    # refname becomes the dict key
    return dict(reversed(x.split()) for x in output.splitlines())

  @staticmethod
  def get_config_settings(repo_working_dir):
    """
    Return a dict of git config key -> value (NUL-separated listing).
    """
    output = ''
    try:
      output = subproc.check_output('git config --list --null'.split(),
                                    cwd=repo_working_dir)
    except subprocess.CalledProcessError as e: # pragma: no cover
      raise SystemExit('fatal: {}'.format(e))

    # FIXME: Ignores multi-valued keys, just let them overwrite for now
    return dict(item.split(b'\n', maxsplit=1)
                for item in output.strip().split(b"\0") if item)

  @staticmethod
  def get_blob_sizes(quiet = False):
    """
    Return (unpacked_size, packed_size) dicts mapping blob sha1 -> size,
    gathered via `git cat-file --batch-all-objects`.
    """
    blob_size_progress = ProgressWriter()
    num_blobs = 0
    processed_blobs_msg = _("Processed %d blob sizes")

    # Get sizes of blobs by sha1
    cmd = '--batch-check=%(objectname) %(objecttype) ' + \
          '%(objectsize) %(objectsize:disk)'
    cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
                       bufsize = -1,
                       stdout = subprocess.PIPE)
    unpacked_size = {}
    packed_size = {}
    for line in cf.stdout:
      try:
        sha, objtype, objsize, objdisksize = line.split()
        objsize, objdisksize = int(objsize), int(objdisksize)
        # Non-blob objects (trees, commits, tags) are listed too; skip them
        if objtype == b'blob':
          unpacked_size[sha] = objsize
          packed_size[sha] = objdisksize
          num_blobs += 1
      except ValueError: # pragma: no cover
        sys.stderr.write(_("Error: unexpected `git cat-file` output: \"%s\"\n") % line)
      if not quiet:
        blob_size_progress.show(processed_blobs_msg % num_blobs)
    cf.wait()
    if not quiet:
      blob_size_progress.finish()
    return unpacked_size, packed_size

  @staticmethod
  def get_file_changes(repo, parent_hash, commit_hash):
    """
    Return a FileChanges list with the differences between parent_hash
    and commit_hash
    """
    file_changes = []

    cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash]
    output = subproc.check_output(cmd, cwd=repo)
    for line in output.splitlines():
      fileinfo, path = line.split(b'\t', 1)
      if path.startswith(b'"'):
        path = PathQuoting.dequote(path)
      oldmode, mode, oldhash, newhash, changetype = fileinfo.split()
      if changetype == b'D':
        file_changes.append(FileChange(b'D', path))
      elif changetype in (b'A', b'M', b'T'):
        # Prefer our filtered id for the blob if we assigned one
        identifier = BLOB_HASH_TO_NEW_ID.get(newhash, newhash)
        file_changes.append(FileChange(b'M', path, identifier, mode))
      else: # pragma: no cover
        raise SystemExit("Unknown change type for line {}".format(line))

    return file_changes

  @staticmethod
  def print_my_version():
    """
    Print an abbreviated hash identifying this script's (original) source,
    undoing local install-time substitutions first so the hash is stable.
    """
    with open(__file__, 'br') as f:
      contents = f.read()
    # If people replaced @@LOCALEDIR@@ string to point at their local
    # directory, undo it so we can get original source version.
    contents = re.sub(br'\A#\!.*',
                      br'#!/usr/bin/env python3', contents)
    contents = re.sub(br'(\("GIT_TEXTDOMAINDIR"\) or ").*"',
                      br'\1@@LOCALEDIR@@"', contents)

    cmd = 'git hash-object --stdin'.split()
    version = subproc.check_output(cmd, input=contents).strip()
    print(decode(version[0:12]))
1768
class FilteringOptions(object):
  """Parsing of git-filter-repo's command-line options.

  Holds the argparse actions for the --path* family of flags, option
  sanity checking, and parsing of auxiliary files (--replace-text,
  --paths-from-file, --strip-blobs-with-ids).
  """
  # Replacement used by --replace-text/--replace-message when a line does
  # not provide its own replacement via '==>'.
  default_replace_text = b'***REMOVED***'
  class AppendFilter(argparse.Action):
    """argparse action for the --path-* flags.

    Appends a (mod_type, match_type, match) tuple to namespace.path_changes,
    where mod_type is 'filter' or 'rename', match_type is 'match', 'glob',
    or 'regex', and match is the path argument (bytes; compiled pattern for
    'regex' filters, an (old, new) pair for renames).
    """
    def __call__(self, parser, namespace, values, option_string=None):
      user_path = values  # keep original spelling for error messages
      # '--path' leaves nothing after '--path-'; treat it as exact matching.
      suffix = option_string[len('--path-'):] or 'match'
      if suffix.startswith('rename'):
        mod_type = 'rename'
        match_type = option_string[len('--path-rename-'):] or 'match'
        values = values.split(b':')
        if len(values) != 2:
          raise SystemExit(_("Error: --path-rename expects one colon in its"
                             " argument: <old_name:new_name>."))
        # Both sides must agree on directory-ness (trailing slash).
        if values[0] and values[1] and not (
            values[0].endswith(b'/') == values[1].endswith(b'/')):
          raise SystemExit(_("Error: With --path-rename, if OLD_NAME and "
                             "NEW_NAME are both non-empty and either ends "
                             "with a slash then both must."))
        if any(v.startswith(b'/') for v in values):
          raise SystemExit(_("Error: Pathnames cannot begin with a '/'"))
        components = values[0].split(b'/') + values[1].split(b'/')
      else:
        mod_type = 'filter'
        match_type = suffix
        components = values.split(b'/')
        if values.startswith(b'/'):
          raise SystemExit(_("Error: Pathnames cannot begin with a '/'"))
      # Refuse '.' or '..' anywhere in any of the given path(s).
      for illegal_path in [b'.', b'..']:
        if illegal_path in components:
          raise SystemExit(_("Error: Invalid path component '%s' found in '%s'")
                           % (decode(illegal_path), decode(user_path)))
      if match_type == 'regex':
        values = re.compile(values)
      items = getattr(namespace, self.dest, []) or []
      items.append((mod_type, match_type, values))
      # A glob filter for a directory should also match files within it.
      if (match_type, mod_type) == ('glob', 'filter'):
        if not values.endswith(b'*'):
          extension = b'*' if values.endswith(b'/') else b'/*'
          items.append((mod_type, match_type, values+extension))
      setattr(namespace, self.dest, items)
1809
1810 class HelperFilter(argparse.Action):
1811 def __call__(self, parser, namespace, values, option_string=None):
1812 af = FilteringOptions.AppendFilter(dest='path_changes',
1813 option_strings=None)
1814 dirname = values if values[-1:] == b'/' else values+b'/'
1815 if option_string == '--subdirectory-filter':
1816 af(parser, namespace, dirname, '--path-match')
1817 af(parser, namespace, dirname+b':', '--path-rename')
1818 elif option_string == '--to-subdirectory-filter':
1819 af(parser, namespace, b':'+dirname, '--path-rename')
1820 else:
1821 raise SystemExit(_("Error: HelperFilter given invalid option_string: %s")
1822 % option_string) # pragma: no cover
1823
1824 class FileWithPathsFilter(argparse.Action):
1825 def __call__(self, parser, namespace, values, option_string=None):
1826 if not namespace.path_changes:
1827 namespace.path_changes = []
1828 namespace.path_changes += FilteringOptions.get_paths_from_file(values)
1829
  @staticmethod
  def create_arg_parser():
    """Build and return the argparse.ArgumentParser for git-filter-repo.

    Options are organized into thematic argument groups (analysis, path
    filtering/renaming, content/message/people filtering, parent rewriting,
    callbacks, locations, and miscellaneous control flags).  The long
    CALLBACKS/EXAMPLES text is attached as the epilog so it appears at the
    end of --help output verbatim.
    """
    # Include usage in the summary, so we can put the description first
    summary = _('''Rewrite (or analyze) repository history

    git-filter-repo destructively rewrites history (unless --analyze or
    --dry-run are given) according to specified rules. It refuses to do any
    rewriting unless either run from a clean fresh clone, or --force was
    given.

    Basic Usage:
      git-filter-repo --analyze
      git-filter-repo [FILTER/RENAME/CONTROL OPTIONS]

    See EXAMPLES section for details.
    ''').rstrip()

    # Provide a long helpful examples section
    example_text = _('''CALLBACKS

    Most callback functions are of the same general format. For a command line
    argument like
      --foo-callback 'BODY'

    the following code will be compiled and called:
      def foo_callback(foo):
        BODY

    The exception on callbacks is the --file-info-callback, which will be
    discussed further below.

    Given the callback style, we can thus make a simple callback to replace
    'Jon' with 'John' in author/committer/tagger names:
      git filter-repo --name-callback 'return name.replace(b"Jon", b"John")'

    To remove all 'Tested-by' tags in commit (or tag) messages:
      git filter-repo --message-callback 'return re.sub(br"\\nTested-by:.*", "", message)'

    To remove all .DS_Store files:
      git filter-repo --filename-callback 'return None if os.path.basename(filename) == b".DS_Store" else filename'

    Note that if BODY resolves to a filename, then the contents of that file
    will be used as the BODY in the callback function.

    The --file-info-callback has a more involved function callback; for it the
    following code will be compiled and called:
      def file_info_callback(filename, mode, blob_id, value):
        BODY

    It is designed to be used in cases where filtering depends on both
    filename and contents (and maybe mode). It is called for file changes
    other than deletions (since deletions have no file contents to operate
    on). This callback is expected to return a tuple of (filename, mode,
    blob_id). It can make use of the following functions from the value
    instance:
      value.get_contents_by_identifier(blob_id) -> contents (bytestring)
      value.get_size_by_identifier(blob_id) -> size_of_blob (int)
      value.insert_file_with_contents(contents) -> blob_id
      value.is_binary(contents) -> bool
      value.apply_replace_text(contents) -> new_contents (bytestring)
    and can read/write the following data member from the value instance:
      value.data (dict)

    The filename can be used for renaming the file similar to
    --filename-callback (or None to drop the change), and mode is one
    of b'100644', b'100755', b'120000', or b'160000'.

    For more detailed examples and explanations AND caveats, see
    https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#CALLBACKS

EXAMPLES

    To get a bunch of reports mentioning renames that have occurred in
    your repo and listing sizes of objects aggregated by any of path,
    directory, extension, or blob-id:
      git filter-repo --analyze

    (These reports can help you choose how to filter your repo; it can
    be useful to re-run this command after filtering to regenerate the
    report and verify the changes look correct.)

    To extract the history that touched just 'guides' and 'tools/releases':
      git filter-repo --path guides/ --path tools/releases

    To remove foo.zip and bar/baz/zips from every revision in history:
      git filter-repo --path foo.zip --path bar/baz/zips/ --invert-paths

    To replace the text 'password' with 'p455w0rd':
      git filter-repo --replace-text <(echo "password==>p455w0rd")

    To use the current version of the .mailmap file to update authors,
    committers, and taggers throughout history and make it permanent:
      git filter-repo --use-mailmap

    To extract the history of 'src/', rename all files to have a new leading
    directory 'my-module' (e.g. src/foo.java -> my-module/src/foo.java), and
    add a 'my-module-' prefix to all tags:
      git filter-repo --path src/ --to-subdirectory-filter my-module --tag-rename '':'my-module-'

    For more detailed examples and explanations, see
    https://htmlpreview.github.io/?https://github.com/newren/git-filter-repo/blob/docs/html/git-filter-repo.html#EXAMPLES''')

    # Create the basic parser
    # usage is suppressed because it is already embedded in `summary` above.
    parser = argparse.ArgumentParser(description=summary,
                                     usage = argparse.SUPPRESS,
                                     add_help = False,
                                     epilog = example_text,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    analyze = parser.add_argument_group(title=_("Analysis"))
    analyze.add_argument('--analyze', action='store_true',
        help=_("Analyze repository history and create a report that may be "
               "useful in determining what to filter in a subsequent run. "
               "Will not modify your repo."))
    analyze.add_argument('--report-dir',
        metavar='DIR_OR_FILE',
        type=os.fsencode,
        dest='report_dir',
        help=_("Directory to write report, defaults to GIT_DIR/filter_repo/analysis,"
               "refuses to run if exists, --force delete existing dir first."))

    path = parser.add_argument_group(title=_("Filtering based on paths "
                                             "(see also --filename-callback)"),
                                     description=textwrap.dedent(_("""
        These options specify the paths to select. Note that much like git
        itself, renames are NOT followed so you may need to specify multiple
        paths, e.g. `--path olddir/ --path newdir/`
        """[1:])))

    path.add_argument('--invert-paths', action='store_false', dest='inclusive',
        help=_("Invert the selection of files from the specified "
               "--path-{match,glob,regex} options below, i.e. only select "
               "files matching none of those options."))

    path.add_argument('--path-match', '--path', metavar='DIR_OR_FILE',
        type=os.fsencode,
        action=FilteringOptions.AppendFilter, dest='path_changes',
        help=_("Exact paths (files or directories) to include in filtered "
               "history. Multiple --path options can be specified to get "
               "a union of paths."))
    path.add_argument('--path-glob', metavar='GLOB', type=os.fsencode,
        action=FilteringOptions.AppendFilter, dest='path_changes',
        help=_("Glob of paths to include in filtered history. Multiple "
               "--path-glob options can be specified to get a union of "
               "paths."))
    path.add_argument('--path-regex', metavar='REGEX', type=os.fsencode,
        action=FilteringOptions.AppendFilter, dest='path_changes',
        help=_("Regex of paths to include in filtered history. Multiple "
               "--path-regex options can be specified to get a union of "
               "paths"))
    path.add_argument('--use-base-name', action='store_true',
        help=_("Match on file base name instead of full path from the top "
               "of the repo. Incompatible with --path-rename, and "
               "incompatible with matching against directory names."))

    rename = parser.add_argument_group(title=_("Renaming based on paths "
                                               "(see also --filename-callback)"))
    rename.add_argument('--path-rename', '--path-rename-match',
        metavar='OLD_NAME:NEW_NAME', dest='path_changes', type=os.fsencode,
        action=FilteringOptions.AppendFilter,
        help=_("Path to rename; if filename or directory matches OLD_NAME "
               "rename to NEW_NAME. Multiple --path-rename options can be "
               "specified. NOTE: If you combine filtering options with "
               "renaming ones, do not rely on a rename argument to select "
               "paths; you also need a filter to select them."))

    helpers = parser.add_argument_group(title=_("Path shortcuts"))
    # --paths exists only to catch a common typo; see parse_args().
    helpers.add_argument('--paths', help=argparse.SUPPRESS, metavar='IGNORE')
    helpers.add_argument('--paths-from-file', metavar='FILENAME',
        type=os.fsencode,
        action=FilteringOptions.FileWithPathsFilter, dest='path_changes',
        help=_("Specify several path filtering and renaming directives, one "
               "per line. Lines with '==>' in them specify path renames, "
               "and lines can begin with 'literal:' (the default), 'glob:', "
               "or 'regex:' to specify different matching styles. Blank "
               "lines and lines starting with a '#' are ignored."))
    helpers.add_argument('--subdirectory-filter', metavar='DIRECTORY',
        action=FilteringOptions.HelperFilter, type=os.fsencode,
        help=_("Only look at history that touches the given subdirectory "
               "and treat that directory as the project root. Equivalent "
               "to using '--path DIRECTORY/ --path-rename DIRECTORY/:'"))
    helpers.add_argument('--to-subdirectory-filter', metavar='DIRECTORY',
        action=FilteringOptions.HelperFilter, type=os.fsencode,
        help=_("Treat the project root as if it were under DIRECTORY. "
               "Equivalent to using '--path-rename :DIRECTORY/'"))

    contents = parser.add_argument_group(title=_("Content editing filters "
                                                 "(see also --blob-callback)"))
    contents.add_argument('--replace-text', metavar='EXPRESSIONS_FILE',
        help=_("A file with expressions that, if found, will be replaced. "
               "By default, each expression is treated as literal text, "
               "but 'regex:' and 'glob:' prefixes are supported. You can "
               "end the line with '==>' and some replacement text to "
               "choose a replacement choice other than the default of '{}'."
               .format(decode(FilteringOptions.default_replace_text))))
    contents.add_argument('--strip-blobs-bigger-than', metavar='SIZE',
        dest='max_blob_size', default=0,
        help=_("Strip blobs (files) bigger than specified size (e.g. '5M', "
               "'2G', etc)"))
    contents.add_argument('--strip-blobs-with-ids', metavar='BLOB-ID-FILENAME',
        help=_("Read git object ids from each line of the given file, and "
               "strip all of them from history"))

    refrename = parser.add_argument_group(title=_("Renaming of refs "
                                                  "(see also --refname-callback)"))
    refrename.add_argument('--tag-rename', metavar='OLD:NEW', type=os.fsencode,
        help=_("Rename tags starting with OLD to start with NEW. For "
               "example, --tag-rename foo:bar will rename tag foo-1.2.3 "
               "to bar-1.2.3; either OLD or NEW can be empty."))

    messages = parser.add_argument_group(title=_("Filtering of commit messages "
                                                 "(see also --message-callback)"))
    messages.add_argument('--replace-message', metavar='EXPRESSIONS_FILE',
        help=_("A file with expressions that, if found in commit or tag "
               "messages, will be replaced. This file uses the same syntax "
               "as --replace-text."))
    messages.add_argument('--preserve-commit-hashes', action='store_true',
        help=_("By default, since commits are rewritten and thus gain new "
               "hashes, references to old commit hashes in commit messages "
               "are replaced with new commit hashes (abbreviated to the same "
               "length as the old reference). Use this flag to turn off "
               "updating commit hashes in commit messages."))
    messages.add_argument('--preserve-commit-encoding', action='store_true',
        help=_("Do not reencode commit messages into UTF-8. By default, if "
               "the commit object specifies an encoding for the commit "
               "message, the message is re-encoded into UTF-8."))

    people = parser.add_argument_group(title=_("Filtering of names & emails "
                                               "(see also --name-callback "
                                               "and --email-callback)"))
    people.add_argument('--mailmap', dest='mailmap', metavar='FILENAME',
        type=os.fsencode,
        help=_("Use specified mailmap file (see git-shortlog(1) for "
               "details on the format) when rewriting author, committer, "
               "and tagger names and emails. If the specified file is "
               "part of git history, historical versions of the file will "
               "be ignored; only the current contents are consulted."))
    people.add_argument('--use-mailmap', dest='mailmap',
        action='store_const', const=b'.mailmap',
        help=_("Same as: '--mailmap .mailmap' "))

    parents = parser.add_argument_group(title=_("Parent rewriting"))
    parents.add_argument('--replace-refs', default=None,
        choices=['delete-no-add', 'delete-and-add',
                 'update-no-add', 'update-or-add',
                 'update-and-add', 'old-default'],
        help=_("How to handle replace refs (see git-replace(1)). Replace "
               "refs can be added during the history rewrite as a way to "
               "allow users to pass old commit IDs (from before "
               "git-filter-repo was run) to git commands and have git know "
               "how to translate those old commit IDs to the new "
               "(post-rewrite) commit IDs. Also, replace refs that existed "
               "before the rewrite can either be deleted or updated. The "
               "choices to pass to --replace-refs thus need to specify both "
               "what to do with existing refs and what to do with commit "
               "rewrites. Thus 'update-and-add' means to update existing "
               "replace refs, and for any commit rewrite (even if already "
               "pointed at by a replace ref) add a new refs/replace/ reference "
               "to map from the old commit ID to the new commit ID. The "
               "default is update-no-add, meaning update existing replace refs "
               "but do not add any new ones. There is also a special "
               "'old-default' option for picking the default used in versions "
               "prior to git-filter-repo-2.45, namely 'update-and-add' upon "
               "the first run of git-filter-repo in a repository and "
               "'update-or-add' if running git-filter-repo again on a "
               "repository."))
    parents.add_argument('--prune-empty', default='auto',
        choices=['always', 'auto', 'never'],
        help=_("Whether to prune empty commits. 'auto' (the default) means "
               "only prune commits which become empty (not commits which were "
               "empty in the original repo, unless their parent was pruned). "
               "When the parent of a commit is pruned, the first non-pruned "
               "ancestor becomes the new parent."))
    parents.add_argument('--prune-degenerate', default='auto',
        choices=['always', 'auto', 'never'],
        help=_("Since merge commits are needed for history topology, they "
               "are typically exempt from pruning. However, they can become "
               "degenerate with the pruning of other commits (having fewer "
               "than two parents, having one commit serve as both parents, or "
               "having one parent as the ancestor of the other.) If such "
               "merge commits have no file changes, they can be pruned. The "
               "default ('auto') is to only prune empty merge commits which "
               "become degenerate (not which started as such)."))
    parents.add_argument('--no-ff', action='store_true',
        help=_("Even if the first parent is or becomes an ancestor of another "
               "parent, do not prune it. This modifies how "
               "--prune-degenerate behaves, and may be useful in projects who "
               "always use merge --no-ff."))

    callback = parser.add_argument_group(title=_("Generic callback code snippets"))
    callback.add_argument('--filename-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing filenames; see CALLBACKS "
               "sections below."))
    callback.add_argument('--file-info-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing file and metadata; see "
               "CALLBACKS sections below."))
    callback.add_argument('--message-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing messages (both commit "
               "messages and tag messages); see CALLBACKS section below."))
    callback.add_argument('--name-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing names of people; see "
               "CALLBACKS section below."))
    callback.add_argument('--email-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing emails addresses; see "
               "CALLBACKS section below."))
    callback.add_argument('--refname-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing refnames; see CALLBACKS "
               "section below."))

    callback.add_argument('--blob-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing blob objects; see "
               "CALLBACKS section below."))
    callback.add_argument('--commit-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing commit objects; see "
               "CALLBACKS section below."))
    callback.add_argument('--tag-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing tag objects. Note that "
               "lightweight tags have no tag object and are thus not "
               "handled by this callback. See CALLBACKS section below."))
    callback.add_argument('--reset-callback', metavar="FUNCTION_BODY_OR_FILE",
        help=_("Python code body for processing reset objects; see "
               "CALLBACKS section below."))

    sdr = parser.add_argument_group(title=_("Sensitive Data Removal Handling"))
    sdr.add_argument('--sensitive-data-removal', '--sdr', action='store_true',
        help=_("This rewrite is intended to remove sensitive data from a "
               "repository. Gather extra information from the rewrite needed "
               "to provide additional instructions on how to clean up other "
               "copies."))
    sdr.add_argument('--no-fetch', action='store_true',
        help=_("By default, --sensitive-data-removal will trigger a "
               "mirror-like fetch of all refs from origin, discarding local "
               "changes, but ensuring that _all_ fetchable refs that hold on "
               "to the sensitve data are rewritten. This flag removes that "
               "fetch, risking that other refs continue holding on to the "
               "sensitive data. This option is implied by --partial or any "
               "flag that implies --partial."))

    desc = _(
      "Specifying alternate source or target locations implies --partial,\n"
      "except that the normal default for --replace-refs is used. However,\n"
      "unlike normal uses of --partial, this doesn't risk mixing old and new\n"
      "history since the old and new histories are in different repositories.")
    location = parser.add_argument_group(title=_("Location to filter from/to"),
                                         description=desc)
    location.add_argument('--source', type=os.fsencode,
        help=_("Git repository to read from"))
    location.add_argument('--target', type=os.fsencode,
        help=_("Git repository to overwrite with filtered history"))

    order = parser.add_argument_group(title=_("Ordering of commits"))
    order.add_argument('--date-order', action='store_true',
        help=_("Processes commits in commit timestamp order."))

    misc = parser.add_argument_group(title=_("Miscellaneous options"))
    misc.add_argument('--help', '-h', action='store_true',
        help=_("Show this help message and exit."))
    misc.add_argument('--version', action='store_true',
        help=_("Display filter-repo's version and exit."))
    misc.add_argument('--proceed', action='store_true',
        help=_("Avoid triggering the no-arguments-specified check."))
    misc.add_argument('--force', '-f', action='store_true',
        help=_("Rewrite repository history even if the current repo does not "
               "look like a fresh clone. History rewriting is irreversible "
               "(and includes immediate pruning of reflogs and old objects), "
               "so be cautious about using this flag."))
    misc.add_argument('--partial', action='store_true',
        help=_("Do a partial history rewrite, resulting in the mixture of "
               "old and new history. This disables rewriting "
               "refs/remotes/origin/* to refs/heads/*, disables removing "
               "of the 'origin' remote, disables removing unexported refs, "
               "disables expiring the reflog, and disables the automatic "
               "post-filter gc. Also, this modifies --tag-rename and "
               "--refname-callback options such that instead of replacing "
               "old refs with new refnames, it will instead create new "
               "refs and keep the old ones around. Use with caution."))
    misc.add_argument('--no-gc', action='store_true',
        help=_("Do not run 'git gc' after filtering."))
    # WARNING: --refs presents a problem with become-degenerate pruning:
    #   * Excluding a commit also excludes its ancestors so when some other
    #     commit has an excluded ancestor as a parent we have no way of
    #     knowing what it is an ancestor of without doing a special
    #     full-graph walk.
    misc.add_argument('--refs', nargs='+',
        help=_("Limit history rewriting to the specified refs. Implies "
               "--partial. In addition to the normal caveats of --partial "
               "(mixing old and new history, no automatic remapping of "
               "refs/remotes/origin/* to refs/heads/*, etc.), this also may "
               "cause problems for pruning of degenerate empty merge "
               "commits when negative revisions are specified."))

    misc.add_argument('--dry-run', action='store_true',
        help=_("Do not change the repository. Run `git fast-export` and "
               "filter its output, and save both the original and the "
               "filtered version for comparison. This also disables "
               "rewriting commit messages due to not knowing new commit "
               "IDs and disables filtering of some empty commits due to "
               "inability to query the fast-import backend." ))
    misc.add_argument('--debug', action='store_true',
        help=_("Print additional information about operations being "
               "performed and commands being run. When used together "
               "with --dry-run, also show extra information about what "
               "would be run."))
    # WARNING: --state-branch has some problems:
    #   * It does not work well with manually inserted objects (user creating
    #     Blob() or Commit() or Tag() objects and calling
    #     RepoFilter.insert(obj) on them).
    #   * It does not work well with multiple source or multiple target repos
    #   * It doesn't work so well with pruning become-empty commits (though
    #     --refs doesn't work so well with it either)
    # These are probably fixable, given some work (e.g. re-importing the
    # graph at the beginning to get the AncestryGraph right, doing our own
    # export of marks instead of using fast-export --export-marks, etc.), but
    # for now just hide the option.
    misc.add_argument('--state-branch',
        #help=_("Enable incremental filtering by saving the mapping of old "
        #       "to new objects to the specified branch upon exit, and"
        #       "loading that mapping from that branch (if it exists) "
        #       "upon startup."))
        help=argparse.SUPPRESS)
    misc.add_argument('--stdin', action='store_true',
        help=_("Instead of running `git fast-export` and filtering its "
               "output, filter the fast-export stream from stdin. The "
               "stdin must be in the expected input format (e.g. it needs "
               "to include original-oid directives)."))
    misc.add_argument('--quiet', action='store_true',
        help=_("Pass --quiet to other git commands called"))
    return parser
2258
2259 @staticmethod
2260 def sanity_check_args(args):
2261 if args.analyze and args.path_changes:
2262 raise SystemExit(_("Error: --analyze is incompatible with --path* flags; "
2263 "it's a read-only operation."))
2264 if args.analyze and args.stdin:
2265 raise SystemExit(_("Error: --analyze is incompatible with --stdin."))
2266 # If no path_changes are found, initialize with empty list but mark as
2267 # not inclusive so that all files match
2268 if args.path_changes == None:
2269 args.path_changes = []
2270 args.inclusive = False
2271 else:
2272 # Similarly, if we have no filtering paths, then no path should be
2273 # filtered out. Based on how newname() works, the easiest way to
2274 # achieve that is setting args.inclusive to False.
2275 if not any(x[0] == 'filter' for x in args.path_changes):
2276 args.inclusive = False
2277 # Also check for incompatible --use-base-name and --path-rename flags.
2278 if args.use_base_name:
2279 if any(x[0] == 'rename' for x in args.path_changes):
2280 raise SystemExit(_("Error: --use-base-name and --path-rename are "
2281 "incompatible."))
2282 # Also throw some sanity checks on git version here;
2283 # PERF: remove these checks once new enough git versions are common
2284 p = subproc.Popen('git fast-export -h'.split(),
2285 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2286 output = p.stdout.read()
2287 if b'--anonymize-map' not in output: # pragma: no cover
2288 global date_format_permissive
2289 date_format_permissive = False
2290 if not any(x in output for x in [b'--mark-tags',b'--[no-]mark-tags']): # pragma: no cover
2291 global write_marks
2292 write_marks = False
2293 if args.state_branch:
2294 # We need a version of git-fast-export with --mark-tags
2295 raise SystemExit(_("Error: need git >= 2.24.0"))
2296 if not any(x in output for x in [b'--reencode', b'--[no-]reencode']): # pragma: no cover
2297 if args.preserve_commit_encoding:
2298 # We need a version of git-fast-export with --reencode
2299 raise SystemExit(_("Error: need git >= 2.23.0"))
2300 else:
2301 # Set args.preserve_commit_encoding to None which we'll check for later
2302 # to avoid passing --reencode=yes to fast-export (that option was the
2303 # default prior to git-2.23)
2304 args.preserve_commit_encoding = None
2305 # If we don't have fast-exoprt --reencode, we may also be missing
2306 # diff-tree --combined-all-paths, which is even more important...
2307 p = subproc.Popen('git diff-tree -h'.split(),
2308 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2309 output = p.stdout.read()
2310 if b'--combined-all-paths' not in output:
2311 # We need a version of git-diff-tree with --combined-all-paths
2312 raise SystemExit(_("Error: need git >= 2.22.0"))
2313 if args.sensitive_data_removal:
2314 p = subproc.Popen('git cat-file -h'.split(),
2315 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2316 output = p.stdout.read()
2317 if b"--batch-command" not in output: # pragma: no cover
2318 raise SystemExit(_("Error: need git >= 2.36.0"))
2319 # End of sanity checks on git version
2320 if args.max_blob_size:
2321 suffix = args.max_blob_size[-1]
2322 if suffix not in '1234567890':
2323 mult = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
2324 if suffix not in mult:
2325 raise SystemExit(_("Error: could not parse --strip-blobs-bigger-than"
2326 " argument %s")
2327 % args.max_blob_size)
2328 args.max_blob_size = int(args.max_blob_size[0:-1]) * mult[suffix]
2329 else:
2330 args.max_blob_size = int(args.max_blob_size)
2331 if args.file_info_callback and (
2332 args.stdin or args.blob_callback or args.filename_callback):
2333 raise SystemExit(_("Error: --file-info-callback is incompatible with "
2334 "--stdin, --blob-callback,\nand --filename-callback."))
2335
2336 @staticmethod
2337 def get_replace_text(filename):
2338 replace_literals = []
2339 replace_regexes = []
2340 with open(filename, 'br') as f:
2341 for line in f:
2342 line = line.rstrip(b'\r\n')
2343
2344 # Determine the replacement
2345 replacement = FilteringOptions.default_replace_text
2346 if b'==>' in line:
2347 line, replacement = line.rsplit(b'==>', 1)
2348
2349 # See if we need to match via regex
2350 regex = None
2351 if line.startswith(b'regex:'):
2352 regex = line[6:]
2353 elif line.startswith(b'glob:'):
2354 regex = glob_to_regex(line[5:])
2355 if regex:
2356 replace_regexes.append((re.compile(regex), replacement))
2357 else:
2358 # Otherwise, find the literal we need to replace
2359 if line.startswith(b'literal:'):
2360 line = line[8:]
2361 if not line:
2362 continue
2363 replace_literals.append((line, replacement))
2364 return {'literals': replace_literals, 'regexes': replace_regexes}
2365
2366 @staticmethod
2367 def get_paths_from_file(filename):
2368 new_path_changes = []
2369 with open(filename, 'br') as f:
2370 for line in f:
2371 line = line.rstrip(b'\r\n')
2372
2373 # Skip blank lines
2374 if not line:
2375 continue
2376 # Skip comment lines
2377 if line.startswith(b'#'):
2378 continue
2379
2380 # Determine the replacement
2381 match_type, repl = 'literal', None
2382 if b'==>' in line:
2383 line, repl = line.rsplit(b'==>', 1)
2384
2385 # See if we need to match via regex
2386 match_type = 'match' # a.k.a. 'literal'
2387 if line.startswith(b'regex:'):
2388 match_type = 'regex'
2389 match = re.compile(line[6:])
2390 elif line.startswith(b'glob:'):
2391 match_type = 'glob'
2392 match = line[5:]
2393 if repl:
2394 raise SystemExit(_("Error: In %s, 'glob:' and '==>' are incompatible (renaming globs makes no sense)" % decode(filename)))
2395 else:
2396 if line.startswith(b'literal:'):
2397 match = line[8:]
2398 else:
2399 match = line
2400 if repl is not None:
2401 if match and repl and match.endswith(b'/') != repl.endswith(b'/'):
2402 raise SystemExit(_("Error: When rename directories, if OLDNAME "
2403 "and NEW_NAME are both non-empty and either "
2404 "ends with a slash then both must."))
2405
2406 # Record the filter or rename
2407 if repl is not None:
2408 new_path_changes.append(['rename', match_type, (match, repl)])
2409 else:
2410 new_path_changes.append(['filter', match_type, match])
2411 if match_type == 'glob' and not match.endswith(b'*'):
2412 extension = b'*' if match.endswith(b'/') else b'/*'
2413 new_path_changes.append(['filter', match_type, match+extension])
2414 return new_path_changes
2415
2416 @staticmethod
2417 def default_options():
2418 return FilteringOptions.parse_args([], error_on_empty = False)
2419
  @staticmethod
  def parse_args(input_args, error_on_empty = True):
    """Parse command-line tokens into a fully normalized options object.

    Handles --help/--version (which exit), runs sanity_check_args(), and
    converts several arguments into richer objects (MailmapInfo, replace
    expression dicts, blob-id sets).  Also derives implied settings such
    as --partial, --no-fetch, args.repack, and the default refs.

    input_args: list of command-line tokens (without argv[0]).
    error_on_empty: if True, print usage and exit when input_args is empty.
    """
    parser = FilteringOptions.create_arg_parser()
    if not input_args and error_on_empty:
      parser.print_usage()
      raise SystemExit(_("No arguments specified."))
    args = parser.parse_args(input_args)
    if args.help:
      parser.print_help()
      raise SystemExit()
    if args.paths:
      # --paths only exists to catch this common typo; point at the real flags.
      raise SystemExit("Error: Option `--paths` unrecognized; did you mean --path or --paths-from-file?")
    if args.version:
      GitUtils.print_my_version()
      raise SystemExit()
    FilteringOptions.sanity_check_args(args)
    if args.mailmap:
      args.mailmap = MailmapInfo(args.mailmap)
    if args.replace_text:
      args.replace_text = FilteringOptions.get_replace_text(args.replace_text)
    if args.replace_message:
      args.replace_message = FilteringOptions.get_replace_text(args.replace_message)
    if args.strip_blobs_with_ids:
      with open(args.strip_blobs_with_ids, 'br') as f:
        args.strip_blobs_with_ids = set(f.read().split())
    else:
      args.strip_blobs_with_ids = set()
    if (args.partial or args.refs) and not args.replace_refs:
      # Partial rewrites default to updating existing replace refs only.
      args.replace_refs = 'update-no-add'
    args.repack = not (args.partial or args.refs or args.no_gc)
    if args.refs or args.source or args.target:
      args.partial = True
    if args.partial:
      args.no_fetch = True
    if not args.refs:
      args.refs = ['--all']
    return args
2457
class RepoAnalyze(object):
  """Implements `git filter-repo --analyze`: walks all history via
  rev-list|diff-tree, gathers size/deletion/rename statistics, and writes a
  set of human-readable report files into a report directory."""

  # First, several helper functions for analyze_commit()

  @staticmethod
  def equiv_class(stats, filename):
    """Return the tuple of names considered equivalent to filename (due to
    renames); a file never renamed is its own singleton class."""
    return stats['equivalence'].get(filename, (filename,))

  @staticmethod
  def setup_equivalence_for_rename(stats, oldname, newname):
    # if A is renamed to B and B is renamed to C, then the user thinks of
    # A, B, and C as all being different names for the same 'file'.  We record
    # this as an equivalence class:
    #   stats['equivalence'][name] = (A,B,C)
    # for name being each of A, B, and C.
    old_tuple = stats['equivalence'].get(oldname, ())
    if newname in old_tuple:
      return
    elif old_tuple:
      new_tuple = tuple(list(old_tuple)+[newname])
    else:
      new_tuple = (oldname, newname)
    for f in new_tuple:
      stats['equivalence'][f] = new_tuple

  @staticmethod
  def setup_or_update_rename_history(stats, commit, oldname, newname):
    """Record that commit renamed oldname (newname kept for API symmetry)."""
    rename_commits = stats['rename_history'].get(oldname, set())
    rename_commits.add(commit)
    stats['rename_history'][oldname] = rename_commits

  @staticmethod
  def handle_renames(stats, commit, change_types, filenames):
    """Update equivalence classes and rename history for each 'R' entry in
    change_types; the rename target is always the last filename."""
    for index, change_type in enumerate(change_types):
      if change_type == ord(b'R'):
        oldname, newname = filenames[index], filenames[-1]
        RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
        RepoAnalyze.setup_or_update_rename_history(stats, commit,
                                                   oldname, newname)

  @staticmethod
  def handle_file(stats, graph, commit, modes, shas, filenames):
    """Process a modify/add/rename of a single path in a commit, updating
    name tracking and undoing any previously-recorded deletion."""
    mode, sha, filename = modes[-1], shas[-1], filenames[-1]

    # Figure out kind of deletions to undo for this file, and update lists
    # of all-names-by-sha and all-filenames
    delmode = 'tree_deletions'
    if mode != b'040000':
      delmode = 'file_deletions'
      stats['names'][sha].add(filename)
      stats['allnames'].add(filename)

    # If the file (or equivalence class of files) was recorded as deleted,
    # clearly it isn't anymore
    equiv = RepoAnalyze.equiv_class(stats, filename)
    for f in equiv:
      stats[delmode].pop(f, None)

    # If we get a modify/add for a path that was renamed, we may need to break
    # the equivalence class.  However, if the modify/add was on a branch that
    # doesn't have the rename in its history, we are still okay.
    need_to_break_equivalence = False
    if equiv[-1] != filename:
      for rename_commit in stats['rename_history'][filename]:
        if graph.is_ancestor(rename_commit, commit):
          need_to_break_equivalence = True

    if need_to_break_equivalence:
      for f in equiv:
        if f in stats['equivalence']:
          del stats['equivalence'][f]

  @staticmethod
  def analyze_commit(stats, graph, commit, parents, date, file_changes):
    """Fold a single commit's file_changes into stats; raises SystemExit on
    change types the analysis does not know how to handle."""
    graph.add_commit_and_parents(commit, parents)
    for change in file_changes:
      modes, shas, change_types, filenames = change
      if len(parents) == 1 and change_types.startswith(b'R'):
        change_types = b'R'  # remove the rename score; we don't care
      if modes[-1] == b'160000':
        continue  # ignore submodule (gitlink) entries
      elif modes[-1] == b'000000':
        # Track when files/directories are deleted
        for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
          if any(x == b'040000' for x in modes[0:-1]):
            stats['tree_deletions'][f] = date
          else:
            stats['file_deletions'][f] = date
      elif change_types.strip(b'AMT') == b'':
        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
      elif modes[-1] == b'040000' and change_types.strip(b'RAM') == b'':
        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
      elif change_types.strip(b'RAMT') == b'':
        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
        RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
      else:
        raise SystemExit(_("Unhandled change type(s): %(change_type)s "
                           "(in commit %(commit)s)")
                         % ({'change_type': change_types, 'commit': commit})
                         ) # pragma: no cover

  @staticmethod
  def gather_data(args):
    """Run rev-list|diff-tree over args.refs and return the stats dict used
    by write_report()."""
    unpacked_size, packed_size = GitUtils.get_blob_sizes()
    stats = {'names': collections.defaultdict(set),
             'allnames' : set(),
             'file_deletions': {},
             'tree_deletions': {},
             'equivalence': {},
             'rename_history': collections.defaultdict(set),
             'unpacked_size': unpacked_size,
             'packed_size': packed_size,
             'num_commits': 0}

    # Setup the rev-list/diff-tree process
    processed_commits_msg = _("Processed %d commits")
    commit_parse_progress = ProgressWriter()
    num_commits = 0
    cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
           ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
           ' --date=short -M -t -c --raw --combined-all-paths')
    dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
    f = dtp.stdout
    line = f.readline()
    if not line:
      raise SystemExit(_("Nothing to analyze; repository is empty."))
    cont = bool(line)
    graph = AncestryGraph()
    while cont:
      commit = line.rstrip()
      parents = f.readline().split()
      date = f.readline().rstrip()

      # We expect a blank line next; if we get a non-blank line then
      # this commit modified no files and we need to move on to the next.
      # If there is no line, we've reached end-of-input.
      line = f.readline()
      if not line:
        cont = False
      line = line.rstrip()

      # If we haven't reached end of input, and we got a blank line meaning
      # a commit that has modified files, then get the file changes associated
      # with this commit.
      file_changes = []
      if cont and not line:
        cont = False
        for line in f:
          if not line.startswith(b':'):
            cont = True
            break
          n = 1+max(1, len(parents))
          assert line.startswith(b':'*(n-1))
          relevant = line[n-1:-1]
          splits = relevant.split(None, n)
          modes = splits[0:n]
          splits = splits[n].split(None, n)
          shas = splits[0:n]
          splits = splits[n].split(b'\t')
          change_types = splits[0]
          filenames = [PathQuoting.dequote(x) for x in splits[1:]]
          file_changes.append([modes, shas, change_types, filenames])

      # If someone is trying to analyze a subset of the history, make sure
      # to avoid dying on commits with parents that we haven't seen before
      if args.refs:
        graph.record_external_commits([p for p in parents
                                       if p not in graph.value])

      # Analyze this commit and update progress
      RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
                                 file_changes)
      num_commits += 1
      commit_parse_progress.show(processed_commits_msg % num_commits)

    # Show the final commits processed message and record the number of commits
    commit_parse_progress.finish()
    stats['num_commits'] = num_commits

    # Close the output, ensure rev-list|diff-tree pipeline completed successfully
    dtp.stdout.close()
    if dtp.wait():
      raise SystemExit(_("Error: rev-list|diff-tree pipeline failed; see above.")) # pragma: no cover

    return stats

  @staticmethod
  def write_report(reportdir, stats):
    """Write the README, renames, directory/extension/path size, and
    blob-sha report files into reportdir from the gathered stats."""
    def datestr(datetimestr):
      # A falsy deletion date means the path still exists.
      return datetimestr if datetimestr else _('<present>').encode()

    def dirnames(path):
      # Yield each ancestor directory of path, ending with b'' (toplevel).
      while True:
        path = os.path.dirname(path)
        yield path
        if path == b'':
          break

    # Compute aggregate size information for paths, extensions, and dirs
    total_size = {'packed': 0, 'unpacked': 0}
    path_size = {'packed': collections.defaultdict(int),
                 'unpacked': collections.defaultdict(int)}
    ext_size = {'packed': collections.defaultdict(int),
                'unpacked': collections.defaultdict(int)}
    dir_size = {'packed': collections.defaultdict(int),
                'unpacked': collections.defaultdict(int)}
    for sha in stats['names']:
      size = {'packed': stats['packed_size'][sha],
              'unpacked': stats['unpacked_size'][sha]}
      for which in ('packed', 'unpacked'):
        for name in stats['names'][sha]:
          total_size[which] += size[which]
          path_size[which][name] += size[which]
          ext = os.path.splitext(name)[1]
          ext_size[which][ext] += size[which]
          for dirname in dirnames(name):
            dir_size[which][dirname] += size[which]

    # Determine if and when extensions and directories were deleted
    ext_deleted_data = {}
    for name in stats['allnames']:
      when = stats['file_deletions'].get(name, None)

      # Update the extension; an extension is considered deleted only if
      # every file with that extension was deleted (None means still alive).
      ext = os.path.splitext(name)[1]
      if when is None:
        ext_deleted_data[ext] = None
      elif ext in ext_deleted_data:
        if ext_deleted_data[ext] is not None:
          # Date strings are YYYY-MM-DD, so lexicographic max is latest.
          ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
      else:
        ext_deleted_data[ext] = when

    dir_deleted_data = {}
    for name in dir_size['packed']:
      dir_deleted_data[name] = stats['tree_deletions'].get(name, None)

    with open(os.path.join(reportdir, b"README"), 'bw') as f:
      # Give a basic overview of this file
      f.write(b"== %s ==\n" % _("Overall Statistics").encode())
      f.write(("  %s: %d\n" % (_("Number of commits"),
                               stats['num_commits'])).encode())
      f.write(("  %s: %d\n" % (_("Number of filenames"),
                               len(path_size['packed']))).encode())
      f.write(("  %s: %d\n" % (_("Number of directories"),
                               len(dir_size['packed']))).encode())
      f.write(("  %s: %d\n" % (_("Number of file extensions"),
                               len(ext_size['packed']))).encode())
      f.write(b"\n")
      f.write(("  %s: %d\n" % (_("Total unpacked size (bytes)"),
                               total_size['unpacked'])).encode())
      f.write(("  %s: %d\n" % (_("Total packed size (bytes)"),
                               total_size['packed'])).encode())
      f.write(b"\n")

      # Mention issues with the report
      f.write(("== %s ==\n" % _("Caveats")).encode())
      f.write(("=== %s ===\n" % _("Sizes")).encode())
      f.write(textwrap.dedent(_("""
        Packed size represents what size your repository would be if no
        trees, commits, tags, or other metadata were included (though it may
        fail to represent de-duplication; see below).  It also represents the
        current packing, which may be suboptimal if you haven't gc'ed for a
        while.

        Unpacked size represents what size your repository would be if no
        trees, commits, tags, or other metadata were included AND if no
        files were packed; i.e., without delta-ing or compression.

        Both unpacked and packed sizes can be slightly misleading.  Deleting
        a blob from history does not save as much space as the unpacked size,
        because it is obviously normally stored in packed form.  Also,
        deleting a blob from history may not save as much space as its packed
        size either, because another blob could be stored as a delta against
        that blob, so when you remove one blob another blob's packed size may
        grow.

        Also, the sum of the packed sizes can add up to more than the
        repository size; if the same contents appeared in the repository in
        multiple places, git will automatically de-dupe and store only one
        copy, while the way sizes are added in this analysis adds the size
        for each file path that has those contents.  Further, if a file is
        ever reverted to a previous version's contents, the previous
        version's size will be counted multiple times in this analysis, even
        though git will only store it once.
        """)[1:]).encode())
      f.write(b"\n")
      f.write(("=== %s ===\n" % _("Deletions")).encode())
      f.write(textwrap.dedent(_("""
        Whether a file is deleted is not a binary quality, since it can be
        deleted on some branches but still exist in others.  Also, it might
        exist in an old tag, but have been deleted in versions newer than
        that.  More thorough tracking could be done, including looking at
        merge commits where one side of history deleted and the other modified,
        in order to give a more holistic picture of deletions.  However, that
        algorithm would not only be more complex to implement, it'd also be
        quite difficult to present and interpret by users.  Since --analyze
        is just about getting a high-level rough picture of history, it instead
        implements the simplistic rule that is good enough for 98% of cases:
          A file is marked as deleted if the last commit in the fast-export
          stream that mentions the file lists it as deleted.
        This makes it dependent on topological ordering, but generally gives
        the "right" answer.
        """)[1:]).encode())
      f.write(b"\n")
      f.write(("=== %s ===\n" % _("Renames")).encode())
      f.write(textwrap.dedent(_("""
        Renames share the same non-binary nature that deletions do, plus
        additional challenges:
          * If the renamed file is renamed again, instead of just two names for
            a path you can have three or more.
          * Rename pairs of the form (oldname, newname) that we consider to be
            different names of the "same file" might only be valid over certain
            commit ranges.  For example, if a new commit reintroduces a file
            named oldname, then new versions of oldname aren't the "same file"
            anymore.  We could try to portray this to the user, but it's easier
            for the user to just break the pairing and only report unbroken
            rename pairings to the user.
          * The ability for users to rename files differently in different
            branches means that our chains of renames will not necessarily be
            linear but may branch out.
        """)[1:]).encode())
      f.write(b"\n")

    # Equivalence classes for names, so if folks only want to keep a
    # certain set of paths, they know the old names they want to include
    # too.
    with open(os.path.join(reportdir, b"renames.txt"), 'bw') as f:
      seen = set()
      for pathname,equiv_group in sorted(stats['equivalence'].items(),
                                         key=lambda x:(x[1], x[0])):
        if equiv_group in seen:
          continue
        seen.add(equiv_group)
        f.write(("{} ->\n    ".format(decode(equiv_group[0])) +
                 "\n    ".join(decode(x) for x in equiv_group[1:]) +
                 "\n").encode())

    # List directories in reverse sorted order of unpacked size
    with open(os.path.join(reportdir, b"directories-deleted-sizes.txt"), 'bw') as f:
      msg = "=== %s ===\n" % _("Deleted directories by reverse size")
      f.write(msg.encode())
      msg = _("Format: unpacked size, packed size, date deleted, directory name\n")
      f.write(msg.encode())
      for dirname, size in sorted(dir_size['packed'].items(),
                                  key=lambda x:(x[1],x[0]), reverse=True):
        if (dir_deleted_data[dirname]):
          f.write(b"  %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname],
                                               size,
                                               datestr(dir_deleted_data[dirname]),
                                               dirname or _('<toplevel>').encode()))

    with open(os.path.join(reportdir, b"directories-all-sizes.txt"), 'bw') as f:
      f.write(("=== %s ===\n" % _("All directories by reverse size")).encode())
      msg = _("Format: unpacked size, packed size, date deleted, directory name\n")
      f.write(msg.encode())
      for dirname, size in sorted(dir_size['packed'].items(),
                                  key=lambda x:(x[1],x[0]), reverse=True):
        f.write(b"  %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname],
                                             size,
                                             datestr(dir_deleted_data[dirname]),
                                             dirname or _("<toplevel>").encode()))

    # List extensions in reverse sorted order of unpacked size
    with open(os.path.join(reportdir, b"extensions-deleted-sizes.txt"), 'bw') as f:
      msg = "=== %s ===\n" % _("Deleted extensions by reverse size")
      f.write(msg.encode())
      msg = _("Format: unpacked size, packed size, date deleted, extension name\n")
      f.write(msg.encode())
      for extname, size in sorted(ext_size['packed'].items(),
                                  key=lambda x:(x[1],x[0]), reverse=True):
        if (ext_deleted_data[extname]):
          f.write(b"  %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname],
                                               size,
                                               datestr(ext_deleted_data[extname]),
                                               extname or _('<no extension>').encode()))

    with open(os.path.join(reportdir, b"extensions-all-sizes.txt"), 'bw') as f:
      f.write(("=== %s ===\n" % _("All extensions by reverse size")).encode())
      msg = _("Format: unpacked size, packed size, date deleted, extension name\n")
      f.write(msg.encode())
      for extname, size in sorted(ext_size['packed'].items(),
                                  key=lambda x:(x[1],x[0]), reverse=True):
        f.write(b"  %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname],
                                             size,
                                             datestr(ext_deleted_data[extname]),
                                             extname or _('<no extension>').encode()))

    # List files in reverse sorted order of unpacked size
    with open(os.path.join(reportdir, b"path-deleted-sizes.txt"), 'bw') as f:
      msg = "=== %s ===\n" % _("Deleted paths by reverse accumulated size")
      f.write(msg.encode())
      msg = _("Format: unpacked size, packed size, date deleted, path name(s)\n")
      f.write(msg.encode())
      for pathname, size in sorted(path_size['packed'].items(),
                                   key=lambda x:(x[1],x[0]), reverse=True):
        when = stats['file_deletions'].get(pathname, None)
        if when:
          f.write(b"  %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname],
                                               size,
                                               datestr(when),
                                               pathname))

    with open(os.path.join(reportdir, b"path-all-sizes.txt"), 'bw') as f:
      msg = "=== %s ===\n" % _("All paths by reverse accumulated size")
      f.write(msg.encode())
      msg = _("Format: unpacked size, packed size, date deleted, path name\n")
      f.write(msg.encode())
      for pathname, size in sorted(path_size['packed'].items(),
                                   key=lambda x:(x[1],x[0]), reverse=True):
        when = stats['file_deletions'].get(pathname, None)
        f.write(b"  %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname],
                                             size,
                                             datestr(when),
                                             pathname))

    # List of filenames and sizes in descending order
    with open(os.path.join(reportdir, b"blob-shas-and-paths.txt"), 'bw') as f:
      f.write(("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")).encode())
      f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n").encode())
      for sha, size in sorted(stats['packed_size'].items(),
                              key=lambda x:(x[1],x[0]), reverse=True):
        if sha not in stats['names']:
          # Some objects in the repository might not be referenced, or not
          # referenced by the branches/tags the user cares about; skip them.
          continue
        names_with_sha = stats['names'][sha]
        if len(names_with_sha) == 1:
          names_with_sha = names_with_sha.pop()
        else:
          names_with_sha = b'[' + b', '.join(sorted(names_with_sha)) + b']'
        f.write(b"  %s %10d %10d %s\n" % (sha,
                                          stats['unpacked_size'][sha],
                                          size,
                                          names_with_sha))

  @staticmethod
  def run(args):
    """Entry point for --analyze: determine/create the report directory,
    gather data, and write the reports."""
    if args.report_dir:
      reportdir = args.report_dir
    else:
      git_dir = GitUtils.determine_git_dir(b'.')

      # Create the report directory as necessary
      results_tmp_dir = os.path.join(git_dir, b'filter-repo')
      if not os.path.isdir(results_tmp_dir):
        os.mkdir(results_tmp_dir)
      reportdir = os.path.join(results_tmp_dir, b"analysis")

    if os.path.isdir(reportdir):
      if args.force:
        sys.stdout.write(_("Warning: Removing recursively: \"%s\"\n") % decode(reportdir))
        shutil.rmtree(reportdir)
      else:
        sys.stdout.write(_("Error: dir already exists (use --force to delete): \"%s\"\n") % decode(reportdir))
        sys.exit(1)

    os.mkdir(reportdir)

    # Gather the data we need
    stats = RepoAnalyze.gather_data(args)

    # Write the reports
    sys.stdout.write(_("Writing reports to \"%s\"...") % decode(reportdir))
    sys.stdout.flush()
    RepoAnalyze.write_report(reportdir, stats)
    sys.stdout.write(_("done.\n"))
    sys.stdout.write(_("README: \"%s\"\n") % decode( os.path.join(reportdir, b"README") ))
2926
class FileInfoValueHelper:
  """Support object handed to file-info callbacks.

  Wraps a long-lived `git cat-file --batch-command` process so callbacks can
  look up blob contents and sizes by hash, insert new blobs, and apply the
  configured replace-text rules.
  """

  def __init__(self, replace_text, insert_blob_func, source_working_dir):
    # Free-form scratch dict that callbacks may use to carry state
    # between invocations.
    self.data = {}
    self._replace_text = replace_text
    self._insert_blob_func = insert_blob_func
    self._cat_file_process = subproc.Popen(
        ['git', 'cat-file', '--batch-command'],
        stdin = subprocess.PIPE,
        stdout = subprocess.PIPE,
        cwd = source_working_dir)

  def finalize(self):
    """Shut down the cat-file helper process."""
    proc = self._cat_file_process
    proc.stdin.close()
    proc.wait()

  def get_contents_by_identifier(self, blobhash):
    """Return the contents of the given blob, or None if it is missing."""
    proc = self._cat_file_process
    proc.stdin.write(b'contents ' + blobhash + b'\n')
    proc.stdin.flush()
    header = proc.stdout.readline()
    fields = header.split()
    if len(fields) != 3:
      assert(header == blobhash+b" missing\n")
      return None
    oid, oidtype, size = fields
    nbytes = int(size) # Convert e.g. b'6283' to 6283
    assert(oidtype == b'blob')
    # cat-file appends a newline after the contents; read it but drop it.
    return proc.stdout.read(nbytes + 1)[:-1]

  def get_size_by_identifier(self, blobhash):
    """Return the size in bytes of the given blob."""
    proc = self._cat_file_process
    proc.stdin.write(b'info ' + blobhash + b'\n')
    proc.stdin.flush()
    oid, oidtype, size = proc.stdout.readline().split()
    assert(oidtype == b'blob')
    return int(size) # Convert e.g. b'6283' to 6283

  def insert_file_with_contents(self, contents):
    """Create a new blob holding contents and return its fast-export id."""
    new_blob = Blob(contents)
    self._insert_blob_func(new_blob)
    return new_blob.id

  def is_binary(self, contents):
    """Treat contents as binary if a NUL byte appears in the first 8kB."""
    return contents.find(b"\0", 0, 8192) != -1

  def apply_replace_text(self, contents):
    """Apply all literal and then regex replace-text rules to contents."""
    updated = contents
    for literal, replacement in self._replace_text['literals']:
      updated = updated.replace(literal, replacement)
    for regex, replacement in self._replace_text['regexes']:
      updated = regex.sub(replacement, updated)
    return updated
2980
class LFSObjectTracker:
  """Tracks which git-lfs objects (by their lfs 'oid') are referenced in the
  source and target repositories, so filtering can report which lfs objects
  became orphaned."""

  class LFSObjs:
    """Per-repo record: fast-export id -> lfs oid, and the set of lfs oids."""
    def __init__(self):
      self.id_to_object_map = {}
      self.objects = set()

  def __init__(self, file_info, check_sources, check_targets):
    self.source_objects = LFSObjectTracker.LFSObjs()
    self.target_objects = LFSObjectTracker.LFSObjs()
    # Cache of git blob hash -> lfs oid, shared between source and target.
    self.hash_to_object_map = {}
    self.file_info = file_info
    self.check_sources = check_sources
    self.check_targets = check_targets
    self.objects_orphaned = False

  def _get_lfs_values(self, contents):
    """Parse contents as a git-lfs pointer file; return its key->value dict,
    or an empty dict if contents is not a valid pointer (too big, malformed
    line, or first key is not 'version')."""
    values = {}
    if len(contents) > 1024:
      return {}
    for line in contents.splitlines():
      try:
        (key, value) = line.split(b' ', 1)
      except ValueError:
        return {}
      if not values and key != b'version':
        return values
      values[key] = value
    return values

  def check_blob_data(self, contents, fast_export_id, source):
    """If contents is an lfs pointer, remember fast_export_id -> lfs oid."""
    if source and not self.check_sources:
      return
    mymap = self.source_objects if source else self.target_objects
    lfs_object_id = self._get_lfs_values(contents).get(b'oid')
    if lfs_object_id:
      mymap.id_to_object_map[fast_export_id] = lfs_object_id

  def check_file_change_data(self, git_id, source):
    """Record the lfs oid (if any) for a file change; git_id is either a
    fast-export mark (int) or a git blob hash (bytes)."""
    if source and not self.check_sources:
      return
    mymap = self.source_objects if source else self.target_objects
    if isinstance(git_id, int):
      lfs_object_id = mymap.id_to_object_map.get(git_id)
      if lfs_object_id:
        mymap.objects.add(lfs_object_id)
    else:
      if git_id in self.hash_to_object_map:
        mymap.objects.add(self.hash_to_object_map[git_id])
        return
      # lfs pointer files are small; anything >= 1024 bytes can't be one.
      size = self.file_info.get_size_by_identifier(git_id)
      if size >= 1024:
        return
      contents = self.file_info.get_contents_by_identifier(git_id)
      lfs_object_id = self._get_lfs_values(contents).get(b'oid')
      if lfs_object_id:
        self.hash_to_object_map[git_id] = lfs_object_id
        mymap.objects.add(lfs_object_id)

  def check_output_object(self, obj):
    """Inspect a filtered output object (Blob or Commit) for lfs pointers."""
    if not self.check_targets:
      return
    if type(obj) == Blob:
      self.check_blob_data(obj.data, obj.id, False)
    elif type(obj) == Commit:
      for change in obj.file_changes:
        # (Removed a stray sys.stdout.flush() debugging leftover here; it
        # flushed on every file change but nothing was being written.)
        if change.type != b'M' or change.mode == b'160000':
          continue
        self.check_file_change_data(change.blob_id, False)

  def find_all_lfs_objects_in_repo(self, repo, source):
    """Scan every object reachable from any ref in repo for lfs pointers."""
    if not source:
      self.file_info = FileInfoValueHelper(None, None, repo)
    # NOTE(review): stderr is piped but never read; if git writes a lot of
    # stderr this could stall -- confirm before changing.
    p = subproc.Popen(["git", "rev-list", "--objects", "--all"],
                      stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                      cwd=repo)
    for line in p.stdout.readlines():
      try:
        (git_oid, filename) = line.split()
      except ValueError:
        # Commit and tree objects only have oid
        continue

      mymap = self.source_objects if source else self.target_objects
      size = self.file_info.get_size_by_identifier(git_oid)
      if size >= 1024:
        continue
      contents = self.file_info.get_contents_by_identifier(git_oid)
      lfs_object_id = self._get_lfs_values(contents).get(b'oid')
      if lfs_object_id:
        mymap.objects.add(lfs_object_id)
    if not source:
      self.file_info.finalize()
3074
class InputFileBackup:
  """File-like reader that tees every byte read from input_file into
  output_file, preserving a copy of the consumed stream."""

  def __init__(self, input_file, output_file):
    self.input_file = input_file
    self.output_file = output_file

  def close(self):
    """Close both the underlying input and the backup output."""
    self.input_file.close()
    self.output_file.close()

  def read(self, size):
    """Read up to size bytes, mirroring them to the backup file."""
    chunk = self.input_file.read(size)
    self.output_file.write(chunk)
    return chunk

  def readline(self):
    """Read one line, mirroring it to the backup file."""
    data = self.input_file.readline()
    self.output_file.write(data)
    return data
3093
class DualFileWriter:
  """Writer that duplicates every write/flush/close onto two files."""

  def __init__(self, file1, file2):
    self.file1 = file1
    self.file2 = file2

  def write(self, *args):
    """Write the same data to both files."""
    for sink in (self.file1, self.file2):
      sink.write(*args)

  def flush(self):
    """Flush both files."""
    for sink in (self.file1, self.file2):
      sink.flush()

  def close(self):
    """Close both files."""
    for sink in (self.file1, self.file2):
      sink.close()
3110
3111class RepoFilter(object):
  def __init__(self,
               args,
               filename_callback = None,
               message_callback = None,
               name_callback = None,
               email_callback = None,
               refname_callback = None,
               blob_callback = None,
               commit_callback = None,
               tag_callback = None,
               reset_callback = None,
               done_callback = None,
               file_info_callback = None):
    """Create a RepoFilter from parsed args (a FilteringOptions namespace)
    plus optional callables invoked on each corresponding piece of the
    fast-export stream.  Passing a callback here AND the matching
    --*-callback argument is an error (checked in _handle_arg_callbacks)."""

    self._args = args

    # Repo we are exporting
    self._repo_working_dir = None

    # Store callbacks for acting on objects printed by FastExport
    self._blob_callback = blob_callback
    self._commit_callback = commit_callback
    self._tag_callback = tag_callback
    self._reset_callback = reset_callback
    self._done_callback = done_callback

    # Store callbacks for acting on slices of FastExport objects
    self._filename_callback = filename_callback  # filenames from commits
    self._message_callback = message_callback    # commit OR tag message
    self._name_callback = name_callback          # author, committer, tagger
    self._email_callback = email_callback        # author, committer, tagger
    self._refname_callback = refname_callback    # from commit/tag/reset
    self._file_info_callback = file_info_callback # various file info
    # Compile any --*-callback strings now; must run after the callback
    # attributes above are set since it reads and overwrites them.
    self._handle_arg_callbacks()

    # Helpers for callbacks
    self._file_info_value = None

    # Defaults for input
    self._input = None
    self._fep = None  # Fast Export Process
    self._fe_orig = None  # Path to where original fast-export output stored
    self._fe_filt = None  # Path to where filtered fast-export output stored
    self._parser = None  # FastExportParser object we are working with

    # Defaults for output
    self._output = None
    self._fip = None  # Fast Import Process
    self._import_pipes = None
    self._managed_output = True

    # A tuple of (depth, list-of-ancestors).  Commits and ancestors are
    # identified by their id (their 'mark' in fast-export or fast-import
    # speak).  The depth of a commit is one more than the max depth of any
    # of its ancestors.
    self._graph = AncestryGraph()
    # Another one, for ancestry of commits in the original repo
    self._orig_graph = AncestryGraph()

    # Names of files that were tweaked in any commit; such paths could lead
    # to subsequent commits being empty
    self._files_tweaked = set()

    # A set of commit hash pairs (oldhash, newhash) which used to be merge
    # commits but due to filtering were turned into non-merge commits.
    # The commits probably have suboptimal commit messages (e.g. "Merge branch
    # next into master").
    self._commits_no_longer_merges = []

    # A dict of original_ids to new_ids; filtering commits means getting
    # new commit hash (sha1sums), and we record the mapping both for
    # diagnostic purposes and so we can rewrite commit messages.  Note that
    # the new_id can be None rather than a commit hash if the original
    # commit became empty and was pruned or was otherwise dropped.
    self._commit_renames = {}

    # A set of original_ids (i.e. original hashes) for which we have not yet
    # gotten the new hashes; the value is always the corresponding fast-export
    # id (i.e. commit.id)
    self._pending_renames = collections.OrderedDict()

    # A dict of commit_hash[0:7] -> set(commit_hashes with that prefix).
    #
    # It's common for commit messages to refer to commits by abbreviated
    # commit hashes, as short as 7 characters.  To facilitate translating
    # such short hashes, we have a mapping of prefixes to full old hashes.
    self._commit_short_old_hashes = collections.defaultdict(set)

    # A set of commit hash references appearing in commit messages which
    # mapped to a valid commit that was removed entirely in the filtering
    # process.  The commit message will continue to reference the
    # now-missing commit hash, since there was nothing to map it to.
    self._commits_referenced_but_removed = set()

    # Other vars related to metadata tracking
    self._already_ran = False
    self._changed_refs = set()
    self._lfs_object_tracker = None

    # Progress handling (number of commits parsed, etc.)
    self._progress_writer = ProgressWriter()
    self._num_commits = 0

    # Size of blobs in the repo
    self._unpacked_size = {}

    # Other vars
    self._sanity_checks_handled = False
    self._finalize_handled = False
    self._orig_refs = None
    self._config_settings = {}
    self._newnames = {}
    self._stash = None

    # Cache a few message translations for performance reasons
    self._parsed_message = _("Parsed %d commits")

    # Compile some regexes and cache those
    # Matches anything that looks like an abbreviated-or-full commit hash.
    self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)')
3231
  def _handle_arg_callbacks(self):
    """Turn each --*-callback command-line string (or filename containing
    code) into a real function and store it on the matching
    self._*_callback attribute."""
    def make_callback(args, bdy):
      # Compile the user-supplied body into `def callback(<args>):` with the
      # module's public globals visible; a single-string args means a
      # one-parameter callback (the dummy second param keeps args a tuple).
      callback_globals = {g: globals()[g] for g in public_globals}
      callback_locals = {}
      if type(args) == str:
        args = (args, '_do_not_use_this_var = None')
      exec('def callback({}):\n'.format(', '.join(args))+
           '  '+'\n  '.join(bdy.splitlines()), callback_globals, callback_locals)
      return callback_locals['callback']
    def handle(which, args=None):
      # `which` is the user-facing option name (may contain '-'); the
      # attribute name uses underscores.
      which_under = which.replace('-','_')
      if not args:
        args = which
      callback_field = '_{}_callback'.format(which_under)
      code_string = getattr(self._args, which_under+'_callback')
      if code_string:
        # If the argument names an existing file, its contents are the code.
        if os.path.exists(code_string):
          with open(code_string, 'r', encoding='utf-8') as f:
            code_string = f.read()
        if getattr(self, callback_field):
          raise SystemExit(_("Error: Cannot pass a %s_callback to RepoFilter "
                             "AND pass --%s-callback"
                           % (which_under, which)))
        # blob/commit/tag/reset callbacks mutate their argument in place, so
        # only the other callbacks are required to return a value.
        if 'return ' not in code_string and \
           which not in ('blob', 'commit', 'tag', 'reset'):
          raise SystemExit(_("Error: --%s-callback should have a return statement")
                           % which)
        setattr(self, callback_field, make_callback(args, code_string))
    handle('filename')
    handle('message')
    handle('name')
    handle('email')
    handle('refname')
    handle('blob')
    handle('commit')
    handle('tag')
    handle('reset')
    handle('file-info', ('filename', 'mode', 'blob_id', 'value'))
3270
3271 def _run_sanity_checks(self):
3272 self._sanity_checks_handled = True
3273 if not self._managed_output:
3274 if not self._args.replace_refs:
3275 # If not _managed_output we don't want to make extra changes to the
3276 # repo, so set default to no-op 'update-no-add'
3277 self._args.replace_refs = 'update-no-add'
3278 return
3279
3280 if self._args.debug:
3281 print("[DEBUG] Passed arguments:\n{}".format(self._args))
3282
3283 # Determine basic repository information
3284 target_working_dir = self._args.target or b'.'
3285 self._orig_refs = GitUtils.get_refs(target_working_dir)
3286 is_bare = GitUtils.is_repository_bare(target_working_dir)
3287 self._config_settings = GitUtils.get_config_settings(target_working_dir)
3288
3289 # Determine if this is second or later run of filter-repo
3290 tmp_dir = self.results_tmp_dir(create_if_missing=False)
3291 ran_path = os.path.join(tmp_dir, b'already_ran')
3292 self._already_ran = os.path.isfile(ran_path)
3293 if self._already_ran:
3294 current_time = time.time()
3295 file_mod_time = os.path.getmtime(ran_path)
3296 file_age = current_time - file_mod_time
3297 if file_age > 86400: # file older than a day
3298 msg = (f"The previous run is older than a day ({decode(ran_path)} already exists).\n"
3299 f"See \"Already Ran\" section in the manual for more information.\n"
3300 f"Treat this run as a continuation of filtering in the previous run (Y/N)? ")
3301 response = input(msg)
3302
3303 if response.lower() != 'y':
3304 os.remove(ran_path)
3305 self._already_ran = False
3306
3307 # Interaction between --already-ran and --sensitive_data_removal
3308 msg = textwrap.dedent(_("""\
3309 Error: Cannot specify --sensitive-data-removal on a follow-up invocation
3310 of git-filter-repo unless it was specified in previously runs."""))
3311 if self._already_ran:
3312 sdr_path = os.path.join(tmp_dir, b'sensitive_data_removal')
3313 sdr_previously = os.path.isfile(sdr_path)
3314 if not sdr_previously and self._args.sensitive_data_removal:
3315 raise SystemExit(msg)
3316 # Treat this as a --sensitive-data-removal run if a previous run was,
3317 # even if it wasn't specified this time
3318 self._args.sensitive_data_removal = sdr_previously
3319
3320 # Have to check sensitive_data_removal interactions here instead of
3321 # sanity_check_args because of the above interaction with already_ran stuff
3322 if self._args.sensitive_data_removal:
3323 if self._args.stdin:
3324 msg = _("Error: sensitive data removal is incompatible with --stdin")
3325 raise SystemExit(msg)
3326 if self._args.source or self._args.target:
3327 msg = _("Error: sensitive data removal is incompatible with --source and --target")
3328 raise SystemExit(msg)
3329
3330 # Default for --replace-refs
3331 if not self._args.replace_refs:
3332 self._args.replace_refs = 'delete-no-add'
3333 if self._args.replace_refs == 'old-default':
3334 self._args.replace_refs = ('update-or-add' if self._already_ran
3335 else 'update-and-add')
3336
3337 # Do sanity checks from the correct directory
3338 if not self._args.force and not self._already_ran:
3339 cwd = os.getcwd()
3340 os.chdir(target_working_dir)
3341 RepoFilter.sanity_check(self._orig_refs, is_bare, self._config_settings)
3342 os.chdir(cwd)
3343
  def _setup_lfs_orphaning_checks(self):
    '''Set up LFS-object orphaning checks for --sensitive-data-removal runs.

    Bails out early (leaving self._lfs_object_tracker as None) when the
    checks are not applicable: sensitive-data-removal is off, a follow-up
    run did no LFS filtering previously, or a first run has no active
    "filter=lfs" entry in HEAD's .gitattributes.  Note that
    self._file_info_value may be created even if we ultimately bail out.
    Otherwise, creates self._lfs_object_tracker and seeds it with the
    initial set of LFS objects.'''
    # Do a couple checks to see if we want to do lfs orphaning checks
    if not self._args.sensitive_data_removal:
      return
    metadata_dir = self.results_tmp_dir()
    lfs_objects_file = os.path.join(metadata_dir, b'original_lfs_objects')
    if self._already_ran:
      # Check if we did lfs filtering in the previous run
      if not os.path.isfile(lfs_objects_file):
        return

    # Set up self._file_info_value so we can query git for stuff
    source_working_dir = self._args.source or b'.'
    self._file_info_value = FileInfoValueHelper(self._args.replace_text,
                                                self.insert,
                                                source_working_dir)

    # One more check to see if we want to do lfs orphaning checks
    if not self._already_ran:
      # Check if lfs filtering is active in HEAD's .gitattributes file
      a = self._file_info_value.get_contents_by_identifier(b"HEAD:.gitattributes")
      if not a or not re.search(rb'\bfilter=lfs\b', a):
        return

    # Set up the object tracker.  Source objects can only be enumerated on
    # a full first run; target objects only on a full (non --partial) run.
    check_sources = not self._already_ran and not self._args.partial
    check_targets = not self._args.partial
    self._lfs_object_tracker = LFSObjectTracker(self._file_info_value,
                                                check_sources,
                                                check_targets)
    self._parser._lfs_object_tracker = self._lfs_object_tracker # kinda gross

    # Get initial objects: on a follow-up run, reload the LFS objects
    # recorded by the previous run; on a --partial first run, scan the repo.
    if self._already_ran:
      with open(lfs_objects_file, 'br') as f:
        for line in f:
          self._lfs_object_tracker.source_objects.objects.add(line.strip())
    elif self._args.partial:
      source = True
      self._lfs_object_tracker.find_all_lfs_objects_in_repo(source_working_dir,
                                                            source)
3385
3386 @staticmethod
3387 def loose_objects_are_replace_refs(git_dir, refs, num_loose_objects):
3388 replace_objects = set()
3389 for refname, rev in refs.items():
3390 if not refname.startswith(b'refs/replace/'):
3391 continue
3392 replace_objects.add(rev)
3393
3394 validobj_re = re.compile(rb'^[0-9a-f]{40}$')
3395 object_dir=os.path.join(git_dir, b'objects')
3396 for root, dirs, files in os.walk(object_dir):
3397 for filename in files:
3398 objname = os.path.basename(root)+filename
3399 if objname not in replace_objects and validobj_re.match(objname):
3400 return False
3401
3402 return True
3403
  @staticmethod
  def sanity_check(refs, is_bare, config_settings):
    '''Abort (via SystemExit) unless the current directory looks like a
    fresh, fully-packed clone: correct GIT_DIR layout, no ref collisions
    on case-insensitive or unicode-normalizing filesystems, fully packed
    objects, a single "origin" remote (or a brand new bare repo), reflogs
    with at most one entry, no stash, and -- for non-bare repos -- no
    uncommitted/unstaged/untracked or unpushed changes and only one
    worktree.  Callers can bypass these checks with --force.'''
    def abort(reason):
      # Local clones made without --no-local share objects with the source
      # repo; give a hint when remote.origin.url is a local directory.
      dirname = config_settings.get(b'remote.origin.url', b'')
      msg = ""
      if dirname and os.path.isdir(dirname):
        msg = _("Note: when cloning local repositories, you need to pass\n"
                " --no-local to git clone to avoid this issue.\n")
      raise SystemExit(
        _("Aborting: Refusing to destructively overwrite repo history since\n"
          "this does not look like a fresh clone.\n"
          " (%s)\n%s"
          "Please operate on a fresh clone instead. If you want to proceed\n"
          "anyway, use --force.") % (reason, msg))

    # Avoid letting people running with weird setups and overwriting GIT_DIR
    # elsewhere
    git_dir = GitUtils.determine_git_dir(b'.')
    if is_bare and git_dir != b'.':
      abort(_("GIT_DIR must be ."))
    elif not is_bare and git_dir != b'.git':
      abort(_("GIT_DIR must be .git"))

    # Check for refname collisions
    if config_settings.get(b'core.ignorecase', b'false') == b'true':
      collisions = collections.defaultdict(list)
      for ref in refs:
        collisions[ref.lower()].append(ref)
      msg = ""
      for ref in collisions:
        if len(collisions[ref]) >= 2:
          msg += " " + decode(b", ".join(collisions[ref])) + "\n"
      if msg:
        raise SystemExit(
          _("Aborting: Cannot rewrite history on a case insensitive\n"
            "filesystem since you have refs that differ in case only:\n"
            "%s") % msg)
    if config_settings.get(b'core.precomposeunicode', b'false') == b'true':
      import unicodedata # Mac users need to have python-3.8
      collisions = collections.defaultdict(list)
      for ref in refs:
        strref = decode(ref)
        collisions[unicodedata.normalize('NFC', strref)].append(strref)
      msg = ""
      for ref in collisions:
        if len(collisions[ref]) >= 2:
          msg += " " + ", ".join(collisions[ref]) + "\n"
      if msg:
        raise SystemExit(
          _("Aborting: Cannot rewrite history on a character normalizing\n"
            "filesystem since you have refs that differ in normalization:\n"
            "%s") % msg)

    # Make sure repo is fully packed, just like a fresh clone would be.
    # Note that transfer.unpackLimit defaults to 100, meaning that a
    # repository with no packs and less than 100 objects should be considered
    # fully packed.
    output = subproc.check_output('git count-objects -v'.split())
    stats = dict(x.split(b': ') for x in output.splitlines())
    num_packs = int(stats[b'packs'])
    num_loose_objects = int(stats[b'count'])
    if num_packs > 1 or \
       num_loose_objects >= 100 or \
       (num_packs == 1 and num_loose_objects > 0 and
        not RepoFilter.loose_objects_are_replace_refs(git_dir, refs,
                                                      num_loose_objects)):
      abort(_("expected freshly packed repo"))

    # Make sure there is precisely one remote, named "origin"...or that this
    # is a new bare repo with no packs and no remotes
    output = subproc.check_output('git remote'.split()).strip()
    if not (output == b"origin" or (num_packs == 0 and not output)):
      abort(_("expected one remote, origin"))

    # Make sure that all reflogs have precisely one entry
    reflog_dir=os.path.join(git_dir, b'logs')
    for root, dirs, files in os.walk(reflog_dir):
      for filename in files:
        pathname = os.path.join(root, filename)
        with open(pathname, 'br') as f:
          if len(f.read().splitlines()) > 1:
            shortpath = pathname[len(reflog_dir)+1:]
            abort(_("expected at most one entry in the reflog for %s") %
                  decode(shortpath))

    # Make sure there are no stashed changes
    if b'refs/stash' in refs:
      abort(_("has stashed changes"))

    # Do extra checks in non-bare repos
    if not is_bare:
      # Avoid uncommitted, unstaged, or untracked changes
      if subproc.call('git diff --staged --quiet'.split()):
        abort(_("you have uncommitted changes"))
      if subproc.call('git diff --quiet'.split()):
        abort(_("you have unstaged changes"))
      untracked_output = subproc.check_output('git ls-files -o'.split())
      if len(untracked_output) > 0:
        uf = untracked_output.rstrip(b'\n').split(b'\n')
        # Since running git-filter-repo can result in files being written to
        # __pycache__ (depending on python version, env vars, etc.), let's
        # ignore those as far as "clean clone" is concerned.
        relevant_uf = [x for x in uf
                       if not x.startswith(b'__pycache__/git_filter_repo.')]
        if len(relevant_uf) > 0:
          abort(_("you have untracked changes"))

      # Avoid unpushed changes
      for refname, rev in refs.items():
        if not refname.startswith(b'refs/heads/'):
          continue
        origin_ref = refname.replace(b'refs/heads/', b'refs/remotes/origin/')
        if origin_ref not in refs:
          abort(_('%s exists, but %s not found') % (decode(refname),
                                                    decode(origin_ref)))
        if rev != refs[origin_ref]:
          abort(_('%s does not match %s') % (decode(refname),
                                             decode(origin_ref)))

      # Make sure there is only one worktree
      output = subproc.check_output('git worktree list'.split())
      if len(output.splitlines()) > 1:
        abort(_('you have multiple worktrees'))
3527
3528 def cleanup(self, repo, repack, reset,
3529 run_quietly=False, show_debuginfo=False):
3530 ''' cleanup repo; if repack then expire reflogs and do a gc --prune=now.
3531 if reset then do a reset --hard. Optionally also curb output if
3532 run_quietly is True, or go the opposite direction and show extra
3533 output if show_debuginfo is True. '''
3534 assert not (run_quietly and show_debuginfo)
3535
3536 if (repack and not run_quietly and not show_debuginfo):
3537 print(_("Repacking your repo and cleaning out old unneeded objects"))
3538 quiet_flags = '--quiet' if run_quietly else ''
3539 cleanup_cmds = []
3540 if repack:
3541 cleanup_cmds = ['git reflog expire --expire=now --all'.split(),
3542 'git gc {} --prune=now'.format(quiet_flags).split()]
3543 if reset:
3544 cleanup_cmds.insert(0, 'git reset {} --hard'.format(quiet_flags).split())
3545 location_info = ' (in {})'.format(decode(repo)) if repo != b'.' else ''
3546 for cmd in cleanup_cmds:
3547 if show_debuginfo:
3548 print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd)))
3549 ret = subproc.call(cmd, cwd=repo)
3550 if ret != 0:
3551 raise SystemExit("fatal: running '%s' failed!" % ' '.join(cmd))
3552 if cmd[0:3] == 'git reflog expire'.split():
3553 self._write_stash()
3554
3555 def _get_rename(self, old_hash):
3556 # If we already know the rename, just return it
3557 new_hash = self._commit_renames.get(old_hash, None)
3558 if new_hash:
3559 return new_hash
3560
3561 # If it's not in the remaining pending renames, we don't know it
3562 if old_hash is not None and old_hash not in self._pending_renames:
3563 return None
3564
3565 # Read through the pending renames until we find it or we've read them all,
3566 # and return whatever we might find
3567 self._flush_renames(old_hash)
3568 return self._commit_renames.get(old_hash, None)
3569
3570 def _flush_renames(self, old_hash=None, limit=0):
3571 # Parse through self._pending_renames until we have read enough. We have
3572 # read enough if:
3573 # self._pending_renames is empty
3574 # old_hash != None and we found a rename for old_hash
3575 # limit > 0 and len(self._pending_renames) started less than 2*limit
3576 # limit > 0 and len(self._pending_renames) < limit
3577 if limit and len(self._pending_renames) < 2 * limit:
3578 return
3579 fi_input, fi_output = self._import_pipes
3580 while self._pending_renames:
3581 orig_hash, new_fast_export_id = self._pending_renames.popitem(last=False)
3582 new_hash = fi_output.readline().rstrip()
3583 self._commit_renames[orig_hash] = new_hash
3584 self._graph.record_hash(new_fast_export_id, new_hash)
3585 if old_hash == orig_hash:
3586 return
3587 if limit and len(self._pending_renames) < limit:
3588 return
3589
3590 def _translate_commit_hash(self, matchobj_or_oldhash):
3591 old_hash = matchobj_or_oldhash
3592 if not isinstance(matchobj_or_oldhash, bytes):
3593 old_hash = matchobj_or_oldhash.group(1)
3594 orig_len = len(old_hash)
3595 new_hash = self._get_rename(old_hash)
3596 if new_hash is None:
3597 if old_hash[0:7] not in self._commit_short_old_hashes:
3598 self._commits_referenced_but_removed.add(old_hash)
3599 return old_hash
3600 possibilities = self._commit_short_old_hashes[old_hash[0:7]]
3601 matches = [x for x in possibilities
3602 if x[0:orig_len] == old_hash]
3603 if len(matches) != 1:
3604 self._commits_referenced_but_removed.add(old_hash)
3605 return old_hash
3606 old_hash = matches[0]
3607 new_hash = self._get_rename(old_hash)
3608
3609 assert new_hash is not None
3610 return new_hash[0:orig_len]
3611
  def _maybe_trim_extra_parents(self, orig_parents, parents):
    '''Due to pruning of empty commits, some parents could be non-existent
    (None) or otherwise redundant.  Remove the non-existent parents, and
    remove redundant parents ***SO LONG AS*** that doesn't transform a
    merge commit into a non-merge commit.

    orig_parents are the parents before any rewriting; parents are the
    same parents after remapping (possibly to an ancestor, or to None if
    the whole ancestry line was pruned).

    Returns a tuple:
      (parents, new_first_parent_if_would_become_non_merge)
    where the second element is non-None only when removing redundant
    parents would have collapsed a merge into a non-merge; in that case
    the original parent list is returned unchanged alongside it.'''

    always_prune = (self._args.prune_degenerate == 'always')

    # Pruning of empty commits means multiple things:
    #   * An original parent of this commit may have been pruned causing the
    #     need to rewrite the reported parent to the nearest ancestor.  We
    #     want to know when we're dealing with such a parent.
    #   * Further, there may be no "nearest ancestor" if the entire history
    #     of that parent was also pruned.  (Detectable by the parent being
    #     'None')
    # Remove all parents rewritten to None, and keep track of which parents
    # were rewritten to an ancestor.
    tmp = zip(parents,
              orig_parents,
              [(x in _SKIPPED_COMMITS or always_prune) for x in orig_parents])
    tmp2 = [x for x in tmp if x[0] is not None]
    if not tmp2:
      # All ancestors have been pruned; we have no parents.
      return [], None
    parents, orig_parents, is_rewritten = [list(x) for x in zip(*tmp2)]

    # We can't have redundant parents if we don't have at least 2 parents
    if len(parents) < 2:
      return parents, None

    # Don't remove redundant parents if user doesn't want us to
    if self._args.prune_degenerate == 'never':
      return parents, None

    # Remove duplicate parents (if both sides of history have lots of commits
    # which become empty due to pruning, the most recent ancestor on both
    # sides may be the same commit), except only remove parents that have
    # been rewritten due to previous empty pruning.
    seen = set()
    seen_add = seen.add
    # Deleting duplicate rewritten parents means keeping parents if either
    # they have not been seen or they are ones that have not been rewritten.
    # NOTE: parents_copy is an alias, not a copy -- but 'parents' is rebound
    # to a new list just below, so parents_copy preserves the pre-dedup list
    # (and is NOT affected by the parents.pop() calls further down).
    parents_copy = parents
    uniq = [[p, orig_parents[i], is_rewritten[i]] for i, p in enumerate(parents)
            if not (p in seen or seen_add(p)) or not is_rewritten[i]]
    parents, orig_parents, is_rewritten = [list(x) for x in zip(*uniq)]
    if len(parents) < 2:
      # Dedup would turn this merge into a non-merge; report it instead
      return parents_copy, parents[0]

    # Flatten unnecessary merges. (If one side of history is entirely
    # empty commits that were pruned, we may end up attempting to
    # merge a commit with its ancestor. Remove parents that are an
    # ancestor of another parent.)
    num_parents = len(parents)
    to_remove = []
    for cur in range(num_parents):
      if not is_rewritten[cur]:
        continue
      for other in range(num_parents):
        if cur == other:
          continue
        if not self._graph.is_ancestor(parents[cur], parents[other]):
          continue
        # parents[cur] is an ancestor of parents[other], so parents[cur]
        # seems redundant. However, if it was intentionally redundant
        # (e.g. a no-ff merge) in the original, then we want to keep it.
        if not always_prune and \
           self._orig_graph.is_ancestor(orig_parents[cur],
                                        orig_parents[other]):
          continue
        # Some folks want their history to have all first parents be merge
        # commits (except for any root commits), and always do a merge --no-ff.
        # For such folks, don't remove the first parent even if it's an
        # ancestor of other commits.
        if self._args.no_ff and cur == 0:
          continue
        # Okay so the cur-th parent is an ancestor of the other-th parent,
        # and it wasn't that way in the original repository; mark the
        # cur-th parent as removable.
        to_remove.append(cur)
        break # cur removed, so skip rest of others -- i.e. check cur+=1
    # Pop from the end so earlier indices stay valid
    for x in reversed(to_remove):
      parents.pop(x)
    if len(parents) < 2:
      # Flattening would turn this merge into a non-merge; report it instead
      return parents_copy, parents[0]

    return parents, None
3702
  def _prunable(self, commit, new_1st_parent, had_file_changes, orig_parents):
    '''Return True if the (already filtered) commit should be pruned as
    empty or degenerate.

    new_1st_parent is the would-be first parent when pruning redundant
    parents would collapse this merge into a non-merge (see
    _maybe_trim_extra_parents); had_file_changes says whether the commit
    had file changes before filtering; orig_parents are the parents prior
    to rewriting.  For the hard cases, queries the fast-import process to
    compare this commit's file contents against its remaining parent.'''
    parents = commit.parents

    if self._args.prune_empty == 'never':
      return False
    always_prune = (self._args.prune_empty == 'always')

    # For merge commits, unless there are prunable (redundant) parents, we
    # do not want to prune
    if len(parents) >= 2 and not new_1st_parent:
      return False

    if len(parents) < 2:
      # Special logic for commits that started empty...
      if not had_file_changes and not always_prune:
        had_parents_pruned = (len(parents) < len(orig_parents) or
                              (len(orig_parents) == 1 and
                               orig_parents[0] in _SKIPPED_COMMITS))
        # If the commit remains empty and had parents which were pruned,
        # then prune this commit; otherwise, retain it
        return (not commit.file_changes and had_parents_pruned)

      # We can only get here if the commit didn't start empty, so if it's
      # empty now, it obviously became empty
      if not commit.file_changes:
        return True

    # If there are no parents of this commit and we didn't match the case
    # above, then this commit cannot be pruned. Since we have no parent(s)
    # to compare to, abort now to prevent future checks from failing.
    if not parents:
      return False

    # Similarly, we cannot handle the hard cases if we don't have a pipe
    # to communicate with fast-import
    if not self._import_pipes:
      return False

    # If there have not been renames/remappings of IDs (due to insertion of
    # new blobs), then we can sometimes know things aren't prunable with a
    # simple check
    if not _IDS.has_renames():
      # non-merge commits can only be empty if blob/file-change editing caused
      # all file changes in the commit to have the same file contents as
      # the parent.
      changed_files = set(change.filename for change in commit.file_changes)
      if len(orig_parents) < 2 and changed_files - self._files_tweaked:
        return False

    # Finally, the hard case: due to either blob rewriting, or due to pruning
    # of empty commits wiping out the first parent history back to the merge
    # base, the list of file_changes we have may not actually differ from our
    # (new) first parent's version of the files, i.e. this would actually be
    # an empty commit. Check by comparing the contents of this commit to its
    # (remaining) parent.
    #
    # NOTE on why this works, for the case of original first parent history
    # having been pruned away due to being empty:
    #   The first parent history having been pruned away due to being
    #   empty implies the original first parent would have a tree (after
    #   filtering) that matched the merge base's tree. Since
    #   file_changes has the changes needed to go from what would have
    #   been the first parent to our new commit, and what would have been
    #   our first parent has a tree that matches the merge base, then if
    #   the new first parent has a tree matching the versions of files in
    #   file_changes, then this new commit is empty and thus prunable.
    fi_input, fi_output = self._import_pipes
    self._flush_renames() # Avoid fi_output having other stuff present
    # Optimization note: we could have two loops over file_changes, the
    # first doing all the self._output.write() calls, and the second doing
    # the rest. But I'm worried about fast-import blocking on fi_output
    # buffers filling up so I instead read from it as I go.
    for change in commit.file_changes:
      parent = new_1st_parent or commit.parents[0] # exists due to above checks
      quoted_filename = PathQuoting.enquote(change.filename)
      # 'ls' asks fast-import what the parent's version of this path is;
      # integer parents are fast-import marks, others are real hashes
      if isinstance(parent, int):
        self._output.write(b"ls :%d %s\n" % (parent, quoted_filename))
      else:
        self._output.write(b"ls %s %s\n" % (parent, quoted_filename))
      self._output.flush()
      parent_version = fi_output.readline().split()
      if change.type == b'D':
        # A deletion only keeps the commit empty if the parent also lacks it
        if parent_version != [b'missing', quoted_filename]:
          return False
      else:
        blob_sha = change.blob_id
        if isinstance(change.blob_id, int):
          # blob_id is a mark; resolve it to a real sha via get-mark
          self._output.write(b"get-mark :%d\n" % change.blob_id)
          self._output.flush()
          blob_sha = fi_output.readline().rstrip()
        if parent_version != [change.mode, b'blob', blob_sha, quoted_filename]:
          return False

    return True
3797
3798 def _record_remapping(self, commit, orig_parents):
3799 new_id = None
3800 # Record the mapping of old commit hash to new one
3801 if commit.original_id and self._import_pipes:
3802 fi_input, fi_output = self._import_pipes
3803 self._output.write(b"get-mark :%d\n" % commit.id)
3804 self._output.flush()
3805 orig_id = commit.original_id
3806 self._commit_short_old_hashes[orig_id[0:7]].add(orig_id)
3807 # Note that we have queued up an id for later reading; flush a
3808 # few of the older ones if we have too many queued up
3809 self._pending_renames[orig_id] = commit.id
3810 self._flush_renames(None, limit=40)
3811 # Also, record if this was a merge commit that turned into a non-merge
3812 # commit.
3813 if len(orig_parents) >= 2 and len(commit.parents) < 2:
3814 self._commits_no_longer_merges.append((commit.original_id, new_id))
3815
3816 def callback_metadata(self, extra_items = dict()):
3817 return {'commit_rename_func': self._translate_commit_hash,
3818 'ancestry_graph': self._graph,
3819 'original_ancestry_graph': self._orig_graph,
3820 **extra_items}
3821
3822 def _tweak_blob(self, blob):
3823 if self._args.max_blob_size and len(blob.data) > self._args.max_blob_size:
3824 blob.skip()
3825
3826 if blob.original_id in self._args.strip_blobs_with_ids:
3827 blob.skip()
3828
3829 if ( self._args.replace_text
3830 and not self._file_info_callback
3831 # not (if blob contains zero byte in the first 8Kb, that is, if blob is binary data)
3832 and not b"\0" in blob.data[0:8192]
3833 ):
3834 for literal, replacement in self._args.replace_text['literals']:
3835 blob.data = blob.data.replace(literal, replacement)
3836 for regex, replacement in self._args.replace_text['regexes']:
3837 blob.data = regex.sub(replacement, blob.data)
3838
3839 if self._blob_callback:
3840 self._blob_callback(blob, self.callback_metadata())
3841
3842 self._insert_into_stream(blob)
3843
  def _filter_files(self, commit):
    '''Apply path filtering and renaming (--path, --path-glob,
    --path-regex, --path-rename, --use-base-name, --invert-paths, and the
    filename callback) to commit.file_changes: drop excluded files,
    resolve rename-induced collisions, strip overlarge or explicitly
    stripped blobs, and leave commit.file_changes sorted by filename.
    Raises SystemExit on an unresolvable pathname collision.'''
    def filename_matches(path_expression, pathname):
      ''' Returns whether path_expression matches pathname or a leading
          directory thereof, allowing path_expression to not have a trailing
          slash even if it is meant to match a leading directory. '''
      if path_expression == b'':
        return True
      n = len(path_expression)
      if (pathname.startswith(path_expression) and
          (path_expression[n-1:n] == b'/' or
           len(pathname) == n or
           pathname[n:n+1] == b'/')):
        return True
      return False

    def newname(path_changes, pathname, use_base_name, filtering_is_inclusive):
      ''' Applies filtering and rename changes from path_changes to pathname,
          returning any of None (file isn't wanted), original filename (file
          is wanted with original name), or new filename. '''
      wanted = False
      full_pathname = pathname
      if use_base_name:
        pathname = os.path.basename(pathname)
      for (mod_type, match_type, path_exp) in path_changes:
        if mod_type == 'filter' and not wanted:
          assert match_type in ('match', 'glob', 'regex')
          if match_type == 'match' and filename_matches(path_exp, pathname):
            wanted = True
          if match_type == 'glob' and fnmatch.fnmatch(pathname, path_exp):
            wanted = True
          if match_type == 'regex' and path_exp.search(pathname):
            wanted = True
        elif mod_type == 'rename':
          match, repl = path_exp
          assert match_type in ('match','regex') # glob was translated to regex
          if match_type == 'match' and filename_matches(match, full_pathname):
            full_pathname = full_pathname.replace(match, repl, 1)
            pathname = full_pathname # rename incompatible with use_base_name
          if match_type == 'regex':
            full_pathname = match.sub(repl, full_pathname)
            pathname = full_pathname # rename incompatible with use_base_name
      return full_pathname if (wanted == filtering_is_inclusive) else None

    args = self._args
    new_file_changes = {} # Assumes no renames or copies, otherwise collisions
    for change in commit.file_changes:
      # NEEDSWORK: _If_ we ever want to pass `--full-tree` to fast-export and
      # parse that output, we'll need to modify this block; `--full-tree`
      # issues a deleteall directive which has no filename, and thus this
      # block would normally strip it. Of course, FileChange() and
      # _parse_optional_filechange() would need updates too.
      if change.type == b'DELETEALL':
        new_file_changes[b''] = change
        continue
      # self._newnames caches old-name -> new-name decisions so the same
      # path is only run through newname()/the callback once per repo
      if change.filename in self._newnames:
        change.filename = self._newnames[change.filename]
      else:
        original_filename = change.filename
        change.filename = newname(args.path_changes, change.filename,
                                  args.use_base_name, args.inclusive)
        if self._filename_callback:
          change.filename = self._filename_callback(change.filename)
        self._newnames[original_filename] = change.filename
      if not change.filename:
        continue # Filtering criteria excluded this file; move on to next one
      if change.filename in new_file_changes:
        # Getting here means that path renaming is in effect, and caused one
        # path to collide with another. That's usually bad, but can be okay
        # under two circumstances:
        #   1) Sometimes people have a file named OLDFILE in old revisions of
        #      history, and they rename to NEWFILE, and would like to rewrite
        #      history so that all revisions refer to it as NEWFILE. As such,
        #      we can allow a collision when (at least) one of the two paths
        #      is a deletion. Note that if OLDFILE and NEWFILE are unrelated
        #      this also allows the rewrite to continue, which makes sense
        #      since OLDFILE is no longer in the way.
        #   2) If OLDFILE and NEWFILE are exactly equal, then writing them
        #      both to the same location poses no problem; we only need one
        #      file. (This could come up if someone copied a file in some
        #      commit, then later either deleted the file or kept it exactly
        #      in sync with the original with any changes, and then decides
        #      they want to rewrite history to only have one of the two files)
        colliding_change = new_file_changes[change.filename]
        if change.type == b'D':
          # We can just throw this one away and keep the other
          continue
        elif change.type == b'M' and (
            change.mode == colliding_change.mode and
            change.blob_id == colliding_change.blob_id):
          # The two are identical, so we can throw this one away and keep other
          continue
        elif new_file_changes[change.filename].type != b'D':
          raise SystemExit(_("File renaming caused colliding pathnames!\n") +
                           _(" Commit: {}\n").format(commit.original_id) +
                           _(" Filename: {}").format(change.filename))
      # Strip files that are too large
      if self._args.max_blob_size and \
         self._unpacked_size.get(change.blob_id, 0) > self._args.max_blob_size:
        continue
      if self._args.strip_blobs_with_ids and \
         change.blob_id in self._args.strip_blobs_with_ids:
        continue
      # Otherwise, record the change
      new_file_changes[change.filename] = change
    # Sorted by filename; DELETEALL's b'' key naturally sorts first
    commit.file_changes = [v for k,v in sorted(new_file_changes.items())]
3949
  def _tweak_commit(self, commit, aux_info):
    """Apply all filters and callbacks to a commit, then stream or prune it.

    In order: rewrites the commit message (--replace-message, message
    callback, commit-hash translation), the author/committer (--mailmap,
    name/email callbacks), and the target branch (--tag-rename, refname
    callback); filters the file changes; records the commit in both the
    old and new ancestry graphs; trims parents made redundant by pruning;
    re-expresses file changes relative to a new first parent when needed;
    runs the --file-info-callback and the general commit callback; and
    finally either inserts the commit into the output stream or skips it
    as prunable (recording the replacement id and any needed Reset).
    """
    if self._args.replace_message:
      for literal, replacement in self._args.replace_message['literals']:
        commit.message = commit.message.replace(literal, replacement)
      for regex, replacement in self._args.replace_message['regexes']:
        commit.message = regex.sub(replacement, commit.message)
    if self._message_callback:
      commit.message = self._message_callback(commit.message)

    # Change the commit message according to callback
    if not self._args.preserve_commit_hashes:
      commit.message = self._hash_re.sub(self._translate_commit_hash,
                                         commit.message)

    # Change the author & committer according to mailmap rules
    args = self._args
    if args.mailmap:
      commit.author_name, commit.author_email = \
          args.mailmap.translate(commit.author_name, commit.author_email)
      commit.committer_name, commit.committer_email = \
          args.mailmap.translate(commit.committer_name, commit.committer_email)
    # Change author & committer according to callbacks
    if self._name_callback:
      commit.author_name = self._name_callback(commit.author_name)
      commit.committer_name = self._name_callback(commit.committer_name)
    if self._email_callback:
      commit.author_email = self._email_callback(commit.author_email)
      commit.committer_email = self._email_callback(commit.committer_email)

    # Sometimes the 'branch' given is a tag; if so, rename it as requested so
    # we don't get any old tagnames
    if self._args.tag_rename:
      commit.branch = RepoFilter._do_tag_rename(args.tag_rename, commit.branch)
    if self._refname_callback:
      commit.branch = self._refname_callback(commit.branch)

    # Filter or rename the list of file changes
    orig_file_changes = set(commit.file_changes)
    self._filter_files(commit)

    # Record ancestry graph
    parents, orig_parents = commit.parents, aux_info['orig_parents']
    if self._args.state_branch:
      external_parents = parents
    else:
      # Integer parents are marks within this stream; non-integers are
      # commit hashes external to the stream (excluded by the export)
      external_parents = [p for p in parents if not isinstance(p, int)]
    # The use of 'reversed' is intentional here; there is a risk that we have
    # duplicates in parents, and we want to map from parents to the first
    # entry we find in orig_parents in such cases.
    parent_reverse_dict = dict(zip(reversed(parents), reversed(orig_parents)))

    self._graph.record_external_commits(external_parents)
    self._orig_graph.record_external_commits(external_parents)
    self._graph.add_commit_and_parents(commit.id, parents) # new githash unknown
    self._orig_graph.add_commit_and_parents(commit.old_id, orig_parents,
                                            commit.original_id)

    # Prune parents (due to pruning of empty commits) if relevant, note that
    # new_1st_parent is None unless this was a merge commit that is becoming
    # a non-merge
    prev_1st_parent = parents[0] if parents else None
    parents, new_1st_parent = self._maybe_trim_extra_parents(orig_parents,
                                                             parents)
    commit.parents = parents

    # If parents were pruned, then we need our file changes to be relative
    # to the new first parent
    #
    # Notes:
    #   * new_1st_parent and new_1st_parent != parents[0] uniquely happens for example when:
    #       working on merge, selecting subset of files and merge base still
    #       valid while first parent history doesn't touch any of those paths,
    #       but second parent history does.  prev_1st_parent had already been
    #       rewritten to the non-None first ancestor and it remains valid.
    #       self._maybe_trim_extra_parents() avoids removing this first parent
    #       because it'd make the commit a non-merge.  However, if there are
    #       no file_changes of note, we'll drop this commit and mark
    #       new_1st_parent as the new replacement.  To correctly determine if
    #       there are no file_changes of note, we need to have the list of
    #       file_changes relative to new_1st_parent.
    #       (See t9390#3, "basic -> basic-ten using '--path ten'")
    #   * prev_1st_parent != parents[0] happens for example when:
    #       similar to above, but the merge base is no longer valid and was
    #       pruned away as well.  Then parents started as e.g. [None, $num],
    #       and both prev_1st_parent and new_1st_parent are None, while parents
    #       after self._maybe_trim_extra_parents() becomes just [$num].
    #       (See t9390#67, "degenerate merge with non-matching filename".)
    #       Since $num was originally a second parent, we need to rewrite
    #       file changes to be relative to parents[0].
    #   * TODO: We should be getting the changes relative to the new first
    #       parent even if self._fep is None, BUT we can't.  Our method of
    #       getting the changes right now is an external git diff invocation,
    #       which we can't do if we just have a fast export stream.  We can't
    #       really work around it by querying the fast-import stream either,
    #       because the 'ls' directive only allows us to list info about
    #       specific paths, but we need to find out which paths exist in two
    #       commits and then query them.  We could maybe force checkpointing in
    #       fast-import, then doing a diff from what'll be the new first parent
    #       back to prev_1st_parent (which may be None, i.e. empty tree), using
    #       the fact that in A->{B,C}->D, where D is merge of B & C, the diff
    #       from C->D == C->A + A->B + B->D, and in these cases A==B, so it
    #       simplifies to C->D == C->A + B->D, and C is our new 1st parent
    #       commit, A is prev_1st_commit, and B->D is commit.file_changes that
    #       we already have.  However, checkpointing the fast-import process
    #       and figuring out how long to wait before we can run our diff just
    #       seems excessive.  For now, just punt and assume the merge wasn't
    #       "evil" (i.e. that its remerge-diff is empty, as is true for most
    #       merges).  If the merge isn't evil, no further steps are necessary.
    if parents and self._fep and (
        prev_1st_parent != parents[0] or
        new_1st_parent and new_1st_parent != parents[0]):
      # Get the id from the original fast export stream corresponding to the
      # new 1st parent.  As noted above, that new 1st parent might be
      # new_1st_parent, or if that is None, it'll be parents[0].
      will_be_1st = new_1st_parent or parents[0]
      old_id = parent_reverse_dict[will_be_1st]
      # Now, translate that to a hash
      will_be_1st_commit_hash = self._orig_graph.map_to_hash(old_id)
      # Get the changes from what is going to be the new 1st parent to this
      # merge commit.  Note that since we are going from the new 1st parent
      # to the merge commit, we can just replace the existing
      # commit.file_changes rather than getting something we need to combine
      # with the existing commit.file_changes.  Also, we can just replace
      # because prev_1st_parent is an ancestor of will_be_1st_commit_hash
      # (or prev_1st_parent is None and first parent history is gone), so
      # even if we retain prev_1st_parent and do not prune it, the changes
      # will still work given the snapshot-based way fast-export/fast-import
      # work.
      commit.file_changes = GitUtils.get_file_changes(self._repo_working_dir,
                                                      will_be_1st_commit_hash,
                                                      commit.original_id)

      # Save these and filter them
      orig_file_changes = set(commit.file_changes)
      self._filter_files(commit)

    # Process the --file-info-callback
    if self._file_info_callback:
      if self._file_info_value is None:
        # Lazily created once, then reused for all subsequent commits
        source_working_dir = self._args.source or b'.'
        self._file_info_value = FileInfoValueHelper(self._args.replace_text,
                                                    self.insert,
                                                    source_working_dir)
      new_file_changes = []
      for change in commit.file_changes:
        if change.type != b'D':
          assert(change.type == b'M')
          (filename, mode, blob_id) = \
            self._file_info_callback(change.filename,
                                     change.mode,
                                     change.blob_id,
                                     self._file_info_value)
          if mode is None:
            # TODO: Should deletion of the file even be a feature?  Might
            # want to remove this branch of the if-elif-else.
            assert(filename is not None)
            assert(blob_id is not None)
            new_change = FileChange(b'D', filename)
          elif filename is None:
            continue # Drop the FileChange from this commit
          else:
            new_change = FileChange(b'M', filename, blob_id, mode)
        else:
          new_change = change # use change as-is for deletions
        new_file_changes.append(new_change)
      commit.file_changes = new_file_changes

    # Call the user-defined callback, if any
    if self._commit_callback:
      self._commit_callback(commit, self.callback_metadata(aux_info))

    # Find out which files were modified by the callbacks.  Such paths could
    # lead to subsequent commits being empty (e.g. if removing a line containing
    # a password from every version of a file that had the password, and some
    # later commit did nothing more than remove that line)
    final_file_changes = set(commit.file_changes)
    if self._args.replace_text or self._blob_callback:
      # Blob contents may have changed without the FileChange objects
      # differing, so treat every touched path as potentially tweaked
      differences = orig_file_changes.union(final_file_changes)
    else:
      differences = orig_file_changes.symmetric_difference(final_file_changes)
    self._files_tweaked.update(x.filename for x in differences)

    # Now print the resulting commit, or if prunable skip it
    if not commit.dumped:
      if not self._prunable(commit, new_1st_parent,
                            aux_info['had_file_changes'], orig_parents):
        self._insert_into_stream(commit)
        self._record_remapping(commit, orig_parents)
      else:
        rewrite_to = new_1st_parent or commit.first_parent()
        commit.skip(new_id = rewrite_to)
        if self._args.state_branch:
          alias = Alias(commit.old_id or commit.id, rewrite_to or deleted_hash)
          self._insert_into_stream(alias)
        if commit.branch.startswith(b'refs/') or commit.branch == b'HEAD':
          # The special check above is because when direct revisions are passed
          # along to fast-export (such as with stashes), there is a chance the
          # revision is rewritten to nothing.  In such cases, we don't want to
          # point an invalid ref that just names a revision to some other point.
          reset = Reset(commit.branch, rewrite_to or deleted_hash)
          self._insert_into_stream(reset)
        self._commit_renames[commit.original_id] = None

    # Show progress
    self._num_commits += 1
    if not self._args.quiet:
      self._progress_writer.show(self._parsed_message % self._num_commits)
4157
4158 @staticmethod
4159 def _do_tag_rename(rename_pair, tagname):
4160 old, new = rename_pair.split(b':', 1)
4161 old, new = b'refs/tags/'+old, b'refs/tags/'+new
4162 if tagname.startswith(old):
4163 return tagname.replace(old, new, 1)
4164 return tagname
4165
4166 def _tweak_tag(self, tag):
4167 # Tweak the tag message according to callbacks
4168 if self._args.replace_message:
4169 for literal, replacement in self._args.replace_message['literals']:
4170 tag.message = tag.message.replace(literal, replacement)
4171 for regex, replacement in self._args.replace_message['regexes']:
4172 tag.message = regex.sub(replacement, tag.message)
4173 if self._message_callback:
4174 tag.message = self._message_callback(tag.message)
4175
4176 # Tweak the tag name according to tag-name-related callbacks
4177 tag_prefix = b'refs/tags/'
4178 fullref = tag_prefix+tag.ref
4179 if self._args.tag_rename:
4180 fullref = RepoFilter._do_tag_rename(self._args.tag_rename, fullref)
4181 if self._refname_callback:
4182 fullref = self._refname_callback(fullref)
4183 if not fullref.startswith(tag_prefix):
4184 msg = "Error: fast-import requires tags to be in refs/tags/ namespace."
4185 msg += "\n {} renamed to {}".format(tag_prefix+tag.ref, fullref)
4186 raise SystemExit(msg)
4187 tag.ref = fullref[len(tag_prefix):]
4188
4189 # Tweak the tagger according to callbacks
4190 if self._args.mailmap:
4191 tag.tagger_name, tag.tagger_email = \
4192 self._args.mailmap.translate(tag.tagger_name, tag.tagger_email)
4193 if self._name_callback:
4194 tag.tagger_name = self._name_callback(tag.tagger_name)
4195 if self._email_callback:
4196 tag.tagger_email = self._email_callback(tag.tagger_email)
4197
4198 # Call general purpose tag callback
4199 if self._tag_callback:
4200 self._tag_callback(tag, self.callback_metadata())
4201
4202 def _tweak_reset(self, reset):
4203 if self._args.tag_rename:
4204 reset.ref = RepoFilter._do_tag_rename(self._args.tag_rename, reset.ref)
4205 if self._refname_callback:
4206 reset.ref = self._refname_callback(reset.ref)
4207 if self._reset_callback:
4208 self._reset_callback(reset, self.callback_metadata())
4209
4210 def results_tmp_dir(self, create_if_missing=True):
4211 target_working_dir = self._args.target or b'.'
4212 git_dir = GitUtils.determine_git_dir(target_working_dir)
4213 d = os.path.join(git_dir, b'filter-repo')
4214 if create_if_missing and not os.path.isdir(d):
4215 os.mkdir(d)
4216 return d
4217
4218 def _load_marks_file(self, marks_basename):
4219 full_branch = 'refs/heads/{}'.format(self._args.state_branch)
4220 marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
4221 working_dir = self._args.target or b'.'
4222 cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
4223 contents = b''
4224 if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
4225 cmd = ['git', '-C', working_dir, 'show',
4226 '%s:%s' % (full_branch, decode(marks_basename))]
4227 try:
4228 contents = subproc.check_output(cmd)
4229 except subprocess.CalledProcessError as e: # pragma: no cover
4230 raise SystemExit(_("Failed loading %s from %s") %
4231 (decode(marks_basename), full_branch))
4232 if contents:
4233 biggest_id = max(int(x.split()[0][1:]) for x in contents.splitlines())
4234 _IDS._next_id = max(_IDS._next_id, biggest_id+1)
4235 with open(marks_file, 'bw') as f:
4236 f.write(contents)
4237 return marks_file
4238
4239 def _save_marks_files(self):
4240 basenames = [b'source-marks', b'target-marks']
4241 working_dir = self._args.target or b'.'
4242
4243 # Check whether the branch exists
4244 parent = []
4245 full_branch = 'refs/heads/{}'.format(self._args.state_branch)
4246 cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
4247 if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
4248 parent = ['-p', full_branch]
4249
4250 # Run 'git hash-object $MARKS_FILE' for each marks file, save result
4251 blob_hashes = {}
4252 for marks_basename in basenames:
4253 marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
4254 if not os.path.isfile(marks_file): # pragma: no cover
4255 raise SystemExit(_("Failed to find %s to save to %s")
4256 % (marks_file, self._args.state_branch))
4257 cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file]
4258 blob_hashes[marks_basename] = subproc.check_output(cmd).strip()
4259
4260 # Run 'git mktree' to create a tree out of it
4261 p = subproc.Popen(['git', '-C', working_dir, 'mktree'],
4262 stdin=subprocess.PIPE, stdout=subprocess.PIPE)
4263 for b in basenames:
4264 p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b))
4265 p.stdin.close()
4266 p.wait()
4267 tree = p.stdout.read().strip()
4268
4269 # Create the new commit
4270 cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files',
4271 tree] + parent)
4272 commit = subproc.check_output(cmd).strip()
4273 subproc.call(['git', '-C', working_dir, 'update-ref', full_branch, commit])
4274
  def importer_only(self):
    # Configure this RepoFilter to run only the fast-import side; the
    # fast-export/filtering side is expected to be driven externally
    # (e.g. by another RepoFilter feeding commits into self.insert()).
    self._run_sanity_checks()
    self._setup_output()
4278
  def set_output(self, outputRepoFilter):
    # Direct this RepoFilter's filtered stream into outputRepoFilter's
    # already-established fast-import process instead of starting our own.
    assert outputRepoFilter._output

    # set_output implies this RepoFilter is doing exporting, though may not
    # be the only one.
    self._setup_input(use_done_feature = False)

    # Set our output management up to pipe to outputRepoFilter's locations
    # (_managed_output = False: presumably outputRepoFilter is responsible
    # for closing/finalizing the shared output -- verify against callers)
    self._managed_output = False
    self._output = outputRepoFilter._output
    self._import_pipes = outputRepoFilter._import_pipes

    # Handle sanity checks, though currently none needed for export-only cases
    self._run_sanity_checks()
4293
4294 def _read_stash(self):
4295 if self._stash:
4296 return
4297 if self._orig_refs and b'refs/stash' in self._orig_refs and \
4298 self._args.refs == ['--all']:
4299 repo_working_dir = self._args.source or b'.'
4300 git_dir = GitUtils.determine_git_dir(repo_working_dir)
4301 stash = os.path.join(git_dir, b'logs', b'refs', b'stash')
4302 if os.path.exists(stash):
4303 self._stash = []
4304 with open(stash, 'br') as f:
4305 for line in f:
4306 (oldhash, newhash, rest) = line.split(None, 2)
4307 self._stash.append((newhash, rest))
4308 self._args.refs.extend([x[0] for x in self._stash])
4309
4310 def _write_stash(self):
4311 last = deleted_hash
4312 if self._stash:
4313 target_working_dir = self._args.target or b'.'
4314 git_dir = GitUtils.determine_git_dir(target_working_dir)
4315 stash = os.path.join(git_dir, b'logs', b'refs', b'stash')
4316 with open(stash, 'bw') as f:
4317 for (hash, rest) in self._stash:
4318 new_hash = self._get_rename(hash)
4319 if new_hash is None:
4320 continue
4321 f.write(b' '.join([last, new_hash, rest]) + b'\n')
4322 last = new_hash
4323 print(_("Rewrote the stash."))
4324
  def _setup_input(self, use_done_feature):
    # Establish self._input, the fast-export stream we will parse: either
    # adopt stdin directly (--stdin), or launch a 'git fast-export' process
    # with flags derived from the requested filtering options.
    # NOTE(review): write_marks and date_format_permissive appear to be
    # module-level capability flags set elsewhere in this file -- confirm.
    if self._args.stdin:
      self._input = sys.stdin.detach()
      sys.stdin = None # Make sure no one tries to accidentally use it
      self._fe_orig = None
    else:
      self._read_stash()
      # We can skip asking fast-export for blob contents when no filter
      # needs to inspect them and we are not copying between repositories.
      skip_blobs = (self._blob_callback is None and
                    (self._args.replace_text is None or
                     self._file_info_callback is not None) and
                    self._args.source == self._args.target)
      extra_flags = []
      if skip_blobs:
        extra_flags.append('--no-data')
        if self._args.max_blob_size:
          # Without blob data in the stream we can't measure sizes
          # ourselves; pre-compute them from the object database instead.
          self._unpacked_size, packed_size = GitUtils.get_blob_sizes()
      if use_done_feature:
        extra_flags.append('--use-done-feature')
      if write_marks:
        extra_flags.append(b'--mark-tags')
      if self._args.state_branch:
        # State-branch support needs marks saved and restored across runs
        assert(write_marks)
        source_marks_file = self._load_marks_file(b'source-marks')
        extra_flags.extend([b'--export-marks='+source_marks_file,
                            b'--import-marks='+source_marks_file])
      if self._args.preserve_commit_encoding is not None: # pragma: no cover
        reencode = 'no' if self._args.preserve_commit_encoding else 'yes'
        extra_flags.append('--reencode='+reencode)
      if self._args.date_order:
        extra_flags.append('--date-order')
      location = ['-C', self._args.source] if self._args.source else []
      fep_cmd = ['git'] + location + ['fast-export', '--show-original-ids',
                 '--signed-tags=strip', '--tag-of-filtered-object=rewrite',
                 '--fake-missing-tagger', '--reference-excluded-parents'
                 ] + extra_flags + self._args.refs
      self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE)
      self._input = self._fep.stdout
      if self._args.dry_run or self._args.debug:
        # Tee the original fast-export stream to a file for inspection
        self._fe_orig = os.path.join(self.results_tmp_dir(),
                                     b'fast-export.original')
        output = open(self._fe_orig, 'bw')
        self._input = InputFileBackup(self._input, output)
        if self._args.debug:
          tmp = [decode(x) if isinstance(x, bytes) else x for x in fep_cmd]
          print("[DEBUG] Running: {}".format(' '.join(tmp)))
          print(" (saving a copy of the output at {})"
                .format(decode(self._fe_orig)))
4372
  def _setup_output(self):
    # Establish self._output, where the filtered stream is written: a
    # 'git fast-import' process normally, or just a file under --dry-run
    # (with both destinations via DualFileWriter under --debug).
    # NOTE(review): date_format_permissive appears to be a module-level
    # capability flag set elsewhere in this file -- confirm.
    if not self._args.dry_run:
      location = ['-C', self._args.target] if self._args.target else []
      fip_cmd = ['git'] + location + ['-c', 'core.ignorecase=false',
                                      'fast-import', '--force', '--quiet']
      if date_format_permissive:
        fip_cmd.append('--date-format=raw-permissive')
      if self._args.state_branch:
        # Persist/restore marks so later runs can resume incrementally
        target_marks_file = self._load_marks_file(b'target-marks')
        fip_cmd.extend([b'--export-marks='+target_marks_file,
                        b'--import-marks='+target_marks_file])
      self._fip = subproc.Popen(fip_cmd, bufsize=-1,
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE)
      self._import_pipes = (self._fip.stdin, self._fip.stdout)
    if self._args.dry_run or self._args.debug:
      # Write the filtered stream to a file for inspection
      self._fe_filt = os.path.join(self.results_tmp_dir(),
                                   b'fast-export.filtered')
      self._output = open(self._fe_filt, 'bw')
    else:
      self._output = self._fip.stdin
    if self._args.debug and not self._args.dry_run:
      # Debug (without dry-run): feed fast-import AND keep the file copy
      self._output = DualFileWriter(self._fip.stdin, self._output)
      tmp = [decode(x) if isinstance(x, bytes) else x for x in fip_cmd]
      print("[DEBUG] Running: {}".format(' '.join(tmp)))
      print(" (using the following file as input: {})"
            .format(decode(self._fe_filt)))
4399
  def _migrate_origin_to_heads(self):
    # Turn refs/remotes/origin/* into refs/heads/* (so they are rewritten
    # like local branches), optionally force-fetch everything from origin
    # for sensitive-data removals, and finally remove the origin remote
    # (except for sensitive-data removals, where it is kept).
    source_working_dir = self._args.source or b'.'
    target_working_dir = self._args.target or b'.'
    refs_to_migrate = set(x for x in self._orig_refs
                          if x.startswith(b'refs/remotes/origin/'))
    refs_to_warn_about = set()
    if refs_to_migrate:
      if self._args.debug:
        print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*")
      p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
                        stdin=subprocess.PIPE, cwd=source_working_dir)
      for ref in refs_to_migrate:
        if ref == b'refs/remotes/origin/HEAD':
          # origin/HEAD is just deleted rather than migrated
          p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
          del self._orig_refs[ref]
          continue
        newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/')
        if newref not in self._orig_refs:
          p.stdin.write(b'create %s %s\n' % (newref, self._orig_refs[ref]))
          self._orig_refs[newref] = self._orig_refs[ref]
        elif self._orig_refs[ref] != self._orig_refs[newref]:
          # Local branch diverged from its origin counterpart; remember it
          # so we can warn before any forced fetch below clobbers it
          refs_to_warn_about.add(newref)
        p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
        del self._orig_refs[ref]
      p.stdin.close()
      if p.wait(): # pragma: no cover
        msg = _("git update-ref failed; see above")
        raise SystemExit(msg)

    if b'remote.origin.url' not in self._config_settings:
      return

    # For sensitive data removals, fetch ALL refs.  Non-mirror clones normally
    # only grab branches and tags, but other refs may hold on to the sensitive
    # data as well.
    if self._args.sensitive_data_removal and \
       not self._args.no_fetch and \
       not self._already_ran and \
       self._config_settings.get(b'remote.origin.mirror', b'false') != b'true':

      if refs_to_warn_about:
        msg = ("Warning: You have refs modified from upstream:\n " +
               "\n ".join([decode(x) for x in refs_to_warn_about]) +
               "\n" +
               " We want to forcibly fetch from upstream to ensure\n" +
               " that all relevent refs are rewritten, but this will\n" +
               " discard your local changes before starting the\n" +
               " rewrite. Proceed with fetch (Y/N)?")
        response = input(msg)

        if response.lower() != 'y':
          self._args.no_fetch = True
          # Don't do the fetch, and don't remove the origin remote
          return

      cmd = 'git fetch -q --prune --update-head-ok --refmap "" origin +refs/*:refs/*'
      m = _("NOTICE: Fetching all refs from origin to make sure we rewrite\n"
            " all history that may reference the sensitive data, via\n"
            " "+cmd)
      print(m)
      # The '""' placeholder has to become a real empty argument; a plain
      # split() would otherwise pass literal double quotes to git
      ret = subproc.call([arg if arg != '""' else '' for arg in cmd.split()],
                         cwd=source_working_dir)
      if ret != 0: # pragma: no cover
        m = _("Warning: Fetching all refs from origin failed")
        print(m)
    if self._args.sensitive_data_removal:
      return

    # Now remove the origin remote
    url = self._config_settings[b'remote.origin.url'].decode(errors='replace')
    m = _("NOTICE: Removing 'origin' remote; see 'Why is my origin removed?'\n"
          " in the manual if you want to push back there.\n"
          " (was %s)") % url
    print(m)
    subproc.call('git remote rm origin'.split(), cwd=target_working_dir)
4475
4476 def _final_commands(self):
4477 self._finalize_handled = True
4478 self._done_callback and self._done_callback()
4479
4480 if self._file_info_value:
4481 self._file_info_value.finalize()
4482 if not self._args.quiet:
4483 self._progress_writer.finish()
4484
4485 def _ref_update(self, target_working_dir):
4486 # Start the update-ref process
4487 p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
4488 stdin=subprocess.PIPE,
4489 cwd=target_working_dir)
4490
4491 # Remove replace_refs from _orig_refs
4492 replace_refs = {k:v for k, v in self._orig_refs.items()
4493 if k.startswith(b'refs/replace/')}
4494 reverse_replace_refs = collections.defaultdict(list)
4495 for k,v in replace_refs.items():
4496 reverse_replace_refs[v].append(k)
4497 all(map(self._orig_refs.pop, replace_refs))
4498
4499 # Remove unused refs
4500 exported_refs, imported_refs = self.get_exported_and_imported_refs()
4501 refs_to_nuke = exported_refs - imported_refs
4502 # Because revisions can be passed to fast-export which handles them as
4503 # though they were refs, we might have bad "refs" to nuke; strip them out.
4504 refs_to_nuke = [x for x in refs_to_nuke
4505 if x.startswith(b'refs/') or x == b'HEAD']
4506 if self._args.partial:
4507 refs_to_nuke = set()
4508 if refs_to_nuke and self._args.debug:
4509 print("[DEBUG] Deleting the following refs:\n "+
4510 decode(b"\n ".join(sorted(refs_to_nuke))))
4511 p.stdin.write(b''.join([b"delete %s\n" % x
4512 for x in refs_to_nuke]))
4513
4514 # Delete or update and add replace_refs; note that fast-export automatically
4515 # handles 'update-no-add', we only need to take action for the other four
4516 # choices for replace_refs.
4517 self._flush_renames()
4518 actual_renames = {k:v for k,v in self._commit_renames.items() if k != v}
4519 if self._args.replace_refs in ['delete-no-add', 'delete-and-add']:
4520 # Delete old replace refs, if unwanted
4521 replace_refs_to_nuke = set(replace_refs)
4522 if self._args.replace_refs == 'delete-and-add':
4523 # git-update-ref won't allow us to update a ref twice, so be careful
4524 # to avoid deleting refs we'll later update
4525 replace_refs_to_nuke = replace_refs_to_nuke.difference(
4526 [b'refs/replace/'+x for x in actual_renames])
4527 p.stdin.write(b''.join([b"delete %s\n" % x
4528 for x in replace_refs_to_nuke]))
4529 if self._args.replace_refs in ['delete-and-add', 'update-or-add',
4530 'update-and-add']:
4531 # Add new replace refs
4532 update_only = (self._args.replace_refs == 'update-or-add')
4533 p.stdin.write(b''.join([b"update refs/replace/%s %s\n" % (old, new)
4534 for old,new in actual_renames.items()
4535 if new and not (update_only and
4536 old in reverse_replace_refs)]))
4537
4538 # Complete the update-ref process
4539 p.stdin.close()
4540 if p.wait():
4541 raise SystemExit(_("git update-ref failed; see above")) # pragma: no cover
4542
4543 def _remap_to(self, oldish_hash):
4544 '''
4545 Given an oldish_hash (from the beginning of the current run), return:
4546 IF oldish_hash is NOT pruned:
4547 the hash of the rewrite of oldish_hash
4548 otherwise:
4549 the hash of the rewrite of the first unpruned ancestor of oldish_hash
4550 '''
4551 old_id = self._orig_graph._hash_to_id[oldish_hash]
4552 new_id = _IDS.translate(old_id)
4553 new_hash = self._graph.git_hash[new_id] if new_id else deleted_hash
4554 return new_hash
4555
  def _compute_metadata(self, metadata_dir, orig_refs):
    """Compute the metadata this run should record under $GIT_DIR/filter-repo.

    Merges this run's results with any previous run's files found in
    metadata_dir (commit-map, ref-map, first-changed-commits) so the
    recorded mappings always relate *original* names to the latest ones.

    Returns a 3-tuple:
      commit_renames: dict of original commit hash -> newest hash (or None
        for commits pruned by this run),
      ref_maps: dict of refname -> (original hash, new hash),
      first_changes: dict of each earliest-changed commit -> its surviving
        rewritten self-or-ancestor (deleted_hash if none survived).
    """
    #
    # First, handle commit_renames
    #
    old_commit_renames = dict()
    if not self._already_ran:
      commit_renames = {old: new
                        for old, new in self._commit_renames.items()
                        }
    else:
      # Read commit-map into old_commit_renames
      with open(os.path.join(metadata_dir, b'commit-map'), 'br') as f:
        f.readline() # Skip the header line
        for line in f:
          (old,new) = line.split()
          old_commit_renames[old] = new
      # Use A->B mappings in old_commit_renames, and B->C mappings in
      # self._commit_renames to yield A->C mappings in commit_renames
      commit_renames = {old: self._commit_renames.get(newish, newish)
                        for old, newish in old_commit_renames.items()}
      # If there are any B->C mappings in self._commit_renames for which
      # there was no A->B mapping in old_commit_renames, then add the
      # B->C mapping to commit_renames too.
      seen = set(old_commit_renames.values())
      commit_renames.update({old: new
                             for old, new in self._commit_renames.items()
                             if old not in seen})

    #
    # Second, handle ref_maps
    #
    exported_refs, imported_refs = self.get_exported_and_imported_refs()

    old_commit_unrenames = dict()
    if not self._already_ran:
      old_ref_map = dict((refname, (old_hash, deleted_hash))
                         for refname, old_hash in orig_refs.items()
                         if refname in exported_refs)
    else:
      # old_commit_renames talk about how commits were renamed in the original
      # run.  Let's reverse it to find out how to get from the intermediate
      # commit name, back to the original.  Because everything in orig_refs
      # right now refers to the intermediate commits after the first run(s),
      # and we need to map them back to what they were before any changes.
      old_commit_unrenames = dict((v,k) for (k,v) in old_commit_renames.items())

      old_ref_map = {}
      # Populate old_ref_map from the 'ref-map' file
      with open(os.path.join(metadata_dir, b'ref-map'), 'br') as f:
        f.readline() # Skip the header line
        for line in f:
          (old,intermediate,ref) = line.split()
          old_ref_map[ref] = (old, intermediate)
      # Append to old_ref_map items from orig_refs that were exported, but
      # get the actual original commit name
      for refname, old_hash in orig_refs.items():
        if refname in old_ref_map:
          continue
        if refname not in exported_refs:
          continue
        # Compute older_hash
        original_hash = old_commit_unrenames.get(old_hash, old_hash)
        old_ref_map[refname] = (original_hash, deleted_hash)

    new_refs = {}
    new_refs_initialized = False
    ref_maps = {}
    self._orig_graph._ensure_reverse_maps_populated()
    for refname, pair in old_ref_map.items():
      old_hash, hash_ref_becomes_if_not_imported_in_this_run = pair
      if refname not in imported_refs:
        new_hash = hash_ref_becomes_if_not_imported_in_this_run
      elif old_hash in commit_renames:
        # Chase the rename through the intermediate (previous-run) name
        intermediate = old_commit_renames.get(old_hash,old_hash)
        if intermediate in self._commit_renames:
          new_hash = self._remap_to(intermediate)
        else:
          new_hash = intermediate
      else: # Must be either an annotated tag, or a ref whose tip was pruned
        if not new_refs_initialized:
          # Lazily query the target repo's refs, only when first needed
          target_working_dir = self._args.target or b'.'
          new_refs = GitUtils.get_refs(target_working_dir)
          new_refs_initialized = True
        if refname in new_refs:
          new_hash = new_refs[refname]
        else:
          new_hash = deleted_hash
      ref_maps[refname] = (old_hash, new_hash)
    if self._args.source or self._args.target:
      # When copying between repos, refs that exist only in the target are
      # recorded as created from an all-zeros "old" hash
      if not new_refs_initialized:
        target_working_dir = self._args.target or b'.'
        new_refs = GitUtils.get_refs(target_working_dir)
        new_refs_initialized = True
      for ref, new_hash in new_refs.items():
        if ref not in orig_refs and not ref.startswith(b'refs/replace/'):
          old_hash = b'0'*len(new_hash)
          ref_maps[ref] = (old_hash, new_hash)

    #
    # Third, handle first_changes
    #

    old_first_changes = dict()
    if self._already_ran:
      # Read first_changes into old_first_changes
      with open(os.path.join(metadata_dir, b'first-changed-commits'), 'br') as f:
        for line in f:
          changed_commit, undeleted_self_or_ancestor = line.strip().split()
          old_first_changes[changed_commit] = undeleted_self_or_ancestor
    # We need to find the commits that were modified whose parents were not.
    # To be able to find parents, we need the commit names as of the beginning
    # of this run, and then when we are done, we need to map them back to the
    # name of the commits from before any git-filter-repo runs.
    #
    # We are excluding here any commits deleted in previous git-filter-repo
    # runs
    # NOTE(review): undo_old_commit_renames and special_changed_commits do
    # not appear to be used later in this function -- confirm before removal.
    undo_old_commit_renames = dict((v,k) for (k,v) in old_commit_renames.items()
                                   if v != deleted_hash)
    # Get a list of all commits that were changed, as of the beginning of
    # this latest run.
    changed_commits = {new
                       for (old,new) in old_commit_renames.items()
                       if old != new and new != deleted_hash} | \
                      {old
                       for (old,new) in self._commit_renames.items()
                       if old != new}
    special_changed_commits = {old
                               for (old,new) in old_commit_renames.items()
                               if new == deleted_hash}
    first_changes = dict()
    for (old,new) in self._commit_renames.items():
      if old == new:
        # old wasn't modified, can't be first change if not even a change
        continue
      if old_commit_unrenames.get(old,old) != old:
        # old was already modified in previous run; while it might represent
        # something that is still a first change, we'll handle that as we
        # loop over old_first_changes below
        continue
      if any(parent in changed_commits
             for parent in self._orig_graph.get_parent_hashes(old)):
        # a parent of old was modified, so old is not a first change
        continue
      # At this point, old IS a first change.  We need to find out what new
      # commit it maps to, or if it doesn't map to one, what new commit was
      # its most recent ancestor that wasn't pruned.
      if new is None:
        new = self._remap_to(old)
      first_changes[old] = (new if new is not None else deleted_hash)
    for (old,undeleted_self_or_ancestor) in old_first_changes.items():
      if undeleted_self_or_ancestor == deleted_hash:
        # old represents a commit that was pruned and whose entire ancestry
        # was pruned.  So, old is still a first change
        first_changes[old] = undeleted_self_or_ancestor
        continue
      intermediate = old_commit_renames.get(old, old)
      usoa = undeleted_self_or_ancestor
      new_ancestor = self._commit_renames.get(usoa, usoa)
      if intermediate == deleted_hash:
        # old was pruned in previous rewrite
        if usoa != new_ancestor:
          # old's ancestor got rewritten in this filtering run; we can drop
          # this one from first_changes.
          continue
        # Getting here means old was a first change and old was pruned in a
        # previous run, and its ancestors that survived were non rewritten in
        # this run, so old remains a first change
        first_changes[old] = new_ancestor # or usoa, since new_ancestor == usoa
        continue
      assert(usoa == intermediate) # old wasn't pruned => usoa == intermediate

      # Check whether parents of intermediate were rewritten.  Note that
      # intermediate in self._commit_renames only means that intermediate was
      # processed by the latest filtering (not necessarily that it changed),
      # but we need to know that before we can check for parent hashes having
      # changed.
      if intermediate not in self._commit_renames:
        # This commit was not processed by this run, so it remains a first
        # change
        first_changes[old] = usoa
        continue
      if any(parent in changed_commits
             for parent in self._orig_graph.get_parent_hashes(intermediate)):
        # An ancestor was modified by this run, so it is no longer a first
        # change; continue to the next one.
        continue
      # This change is a first_change; find the new commit its usoa maps to
      new = self._remap_to(intermediate)
      assert(new is not None)
      first_changes[old] = new

    return commit_renames, ref_maps, first_changes
4748
4749 def _handle_lfs_metadata(self, metadata_dir):
4750 if self._lfs_object_tracker is None:
4751 print("NOTE: LFS object orphaning not checked (LFS not in use)")
4752 return
4753
4754 if self._args.partial:
4755 target_working_dir = self._args.target or b'.'
4756 source = False
4757 self._lfs_object_tracker.find_all_lfs_objects_in_repo(target_working_dir,
4758 source)
4759
4760 with open(os.path.join(metadata_dir, b'original_lfs_objects'), 'bw') as f:
4761 for obj in sorted(self._lfs_object_tracker.source_objects.objects):
4762 f.write(obj+b"\n")
4763
4764 orphaned_lfs_path = os.path.join(metadata_dir, b'orphaned_lfs_objects')
4765 msg = textwrap.dedent(_(f"""\
4766 NOTE: There were LFS Objects Orphaned by this rewrite recorded in
4767 {decode(orphaned_lfs_path)}."""))
4768 with open(orphaned_lfs_path, 'bw') as f:
4769 differences = self._lfs_object_tracker.source_objects.objects - \
4770 self._lfs_object_tracker.target_objects.objects
4771 for obj in sorted(differences):
4772 f.write(obj+b"\n")
4773 if differences:
4774 self._lfs_object_tracker.objects_orphaned = True
4775 print(msg)
4776
4777 def _record_metadata(self, metadata_dir, orig_refs):
4778 self._flush_renames()
4779 commit_renames, ref_maps, first_changes = \
4780 self._compute_metadata(metadata_dir, orig_refs)
4781
4782 if self._args.sensitive_data_removal:
4783 changed_commits = sum(k!=v for (k,v) in commit_renames.items())
4784 print(f"You rewrote {changed_commits} (of {len(commit_renames)}) commits.")
4785 print("") # Add a blank line before important rewrite information
4786 print(f"NOTE: First Changed Commit(s) is/are:\n "
4787 + decode(b"\n ".join(x for x in first_changes)))
4788
4789 with open(os.path.join(metadata_dir, b'sensitive_data_removal'), 'bw') as f:
4790 pass # Write nothing; we only need the file created
4791
4792 self._handle_lfs_metadata(metadata_dir)
4793 print("") # Add a blank line after important rewrite information
4794
4795 with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f:
4796 f.write(("%-40s %s\n" % (_("old"), _("new"))).encode())
4797 for (old,new) in sorted(commit_renames.items()):
4798 msg = b'%s %s\n' % (old, new if new != None else deleted_hash)
4799 f.write(msg)
4800
4801 with open(os.path.join(metadata_dir, b'ref-map'), 'bw') as f:
4802 f.write(("%-40s %-40s %s\n" % (_("old"), _("new"), _("ref"))).encode())
4803 for refname, hash_pair in sorted(ref_maps.items()):
4804 (old_hash, new_hash) = hash_pair
4805 f.write(b'%s %s %s\n' % (old_hash, new_hash, refname))
4806 if old_hash != new_hash:
4807 self._changed_refs.add(refname)
4808
4809 with open(os.path.join(metadata_dir, b'changed-refs'), 'bw') as f:
4810 for refname in sorted(self._changed_refs):
4811 f.write(b'%s\n' % refname)
4812
4813 with open(os.path.join(metadata_dir, b'first-changed-commits'), 'bw') as f:
4814 for commit, undeleted_self_or_ancestor in sorted(first_changes.items()):
4815 f.write(b'%s %s\n' % (commit, undeleted_self_or_ancestor))
4816
4817 with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f:
4818 issues_found = False
4819 if self._commits_no_longer_merges:
4820 issues_found = True
4821
4822 f.write(textwrap.dedent(_('''
4823 The following commits used to be merge commits but due to filtering
4824 are now regular commits; they likely have suboptimal commit messages
4825 (e.g. "Merge branch next into master"). Original commit hash on the
4826 left, commit hash after filtering/rewriting on the right:
4827 ''')[1:]).encode())
4828 for oldhash, newhash in self._commits_no_longer_merges:
4829 f.write(' {} {}\n'.format(oldhash, newhash).encode())
4830 f.write(b'\n')
4831
4832 if self._commits_referenced_but_removed:
4833 issues_found = True
4834 f.write(textwrap.dedent(_('''
4835 The following commits were filtered out, but referenced in another
4836 commit message. The reference to the now-nonexistent commit hash
4837 (or a substring thereof) was left as-is in any commit messages:
4838 ''')[1:]).encode())
4839 for bad_commit_reference in self._commits_referenced_but_removed:
4840 f.write(' {}\n'.format(bad_commit_reference).encode())
4841 f.write(b'\n')
4842
4843 if not issues_found:
4844 f.write(_("No filtering problems encountered.\n").encode())
4845
4846 with open(os.path.join(metadata_dir, b'already_ran'), 'bw') as f:
4847 f.write(_("This file exists to allow you to filter again without --force,\n"
4848 "and to specify that metadata files should be updated instead\n"
4849 "of rewritten").encode())
4850
4851 def finish(self):
4852 ''' Alternative to run() when there is no input of our own to parse,
4853 meaning that run only really needs to close the handle to fast-import
4854 and let it finish, thus making a call to "run" feel like a misnomer. '''
4855 assert not self._input
4856 assert self._managed_output
4857 self.run()
4858
4859 def insert(self, obj, direct_insertion = False):
4860 if not direct_insertion:
4861 if type(obj) == Blob:
4862 self._tweak_blob(obj)
4863 elif type(obj) == Commit:
4864 aux_info = {'orig_parents': obj.parents,
4865 'had_file_changes': bool(obj.file_changes)}
4866 self._tweak_commit(obj, aux_info)
4867 elif type(obj) == Reset:
4868 self._tweak_reset(obj)
4869 elif type(obj) == Tag:
4870 self._tweak_tag(obj)
4871 self._insert_into_stream(obj)
4872
4873 def _insert_into_stream(self, obj):
4874 if not obj.dumped:
4875 if self._lfs_object_tracker:
4876 self._lfs_object_tracker.check_output_object(obj)
4877 if self._parser:
4878 self._parser.insert(obj)
4879 else:
4880 obj.dump(self._output)
4881
  def get_exported_and_imported_refs(self):
    """Return the refs exported/imported, as tracked by our parser."""
    return self._parser.get_exported_and_imported_refs()
4884
  def run(self):
    """Run the full filtering pipeline.

    Sets up fast-export and fast-import (unless input/output were provided
    externally), streams and tweaks all objects, waits for the subprocesses,
    updates refs, records run metadata, and performs final cleanup.  Steps
    here are strictly order-dependent: export must finish before we close
    input, and output must close before waiting on fast-import.
    """
    start = time.time()
    # Only set up the pipeline ourselves if the caller supplied neither end.
    if not self._input and not self._output:
      self._run_sanity_checks()
      if not self._args.dry_run and not self._args.partial:
        self._read_stash()
        self._migrate_origin_to_heads()
      self._setup_input(use_done_feature = True)
      self._setup_output()
    assert self._sanity_checks_handled

    if self._input:
      # Create and run the filter
      self._repo_working_dir = self._args.source or b'.'
      self._parser = FastExportParser(blob_callback = self._tweak_blob,
                                      commit_callback = self._tweak_commit,
                                      tag_callback = self._tweak_tag,
                                      reset_callback = self._tweak_reset,
                                      done_callback = self._final_commands)
      self._setup_lfs_orphaning_checks()
      self._parser.run(self._input, self._output)
      # The done callback may not have fired (e.g. no 'done' in the stream).
      if not self._finalize_handled:
        self._final_commands()

      # Make sure fast-export completed successfully
      if not self._args.stdin and self._fep.wait():
        raise SystemExit(_("Error: fast-export failed; see above.")) # pragma: no cover
      self._input.close()

    # If we're not the manager of self._output, we should avoid post-run cleanup
    if not self._managed_output:
      return

    # Close the output and ensure fast-import successfully completes
    self._output.close()
    if not self._args.dry_run and self._fip.wait():
      raise SystemExit(_("Error: fast-import failed; see above.")) # pragma: no cover

    # With fast-export and fast-import complete, update state if requested
    if self._args.state_branch:
      self._save_marks_files()

    # Notify user how long it took, before doing a gc and such
    msg = "New history written in {:.2f} seconds..."
    if self._args.repack:
      msg = "New history written in {:.2f} seconds; now repacking/cleaning..."
    print(msg.format(time.time()-start))

    # Exit early, if requested
    if self._args.dry_run:
      print(_("NOTE: Not running fast-import or cleaning up; --dry-run passed."))
      if self._fe_orig:
        print(_("  Requested filtering can be seen by comparing:"))
        print("  " + decode(self._fe_orig))
      else:
        print(_("  Requested filtering can be seen at:"))
        print("  " + decode(self._fe_filt))
      return

    target_working_dir = self._args.target or b'.'
    if self._input:
      self._ref_update(target_working_dir)

    # Write out data about run
    self._record_metadata(self.results_tmp_dir(), self._orig_refs)

    # Final cleanup:
    # If we need a repack, then nuke the reflogs and repack.
    # If we need a reset, do a reset --hard
    reset = not GitUtils.is_repository_bare(target_working_dir)
    self.cleanup(target_working_dir, self._args.repack, reset,
                 run_quietly=self._args.quiet,
                 show_debuginfo=self._args.debug)

    # Let user know how long it took
    print(_("Completely finished after {:.2f} seconds.")
          .format(time.time()-start))

    # Give post-rewrite instructions for cleaning up other copies for SDR
    if self._args.sensitive_data_removal:
      lfs_note = ""
      if self._lfs_object_tracker and \
         self._lfs_object_tracker.objects_orphaned == True:
        lfs_note = _(" and LFS Objects Orphaned")
      # Default assumes a normal fetched clone; adjust for --no-fetch modes.
      push_command = "git push --force --mirror origin"
      if self._args.no_fetch:
        if self._args.partial:
          push_command = "git push --force origin " + \
                         " ".join(sorted([decode(x) for x in self._changed_refs]))
        else:
          push_command = "git push --all --tags origin"
      print("")
      print(sdr_next_steps % (push_command, lfs_note, lfs_note))
4978
def main():
  """Command-line entry point: parse args, then analyze or filter the repo."""
  setup_gettext()
  args = FilteringOptions.parse_args(sys.argv[1:])
  if args.analyze:
    RepoAnalyze.run(args)
  else:
    # Named repo_filter rather than "filter" to avoid shadowing the builtin.
    repo_filter = RepoFilter(args)
    repo_filter.run()
4987
# Dispatch to main() only when executed as a script; importers get the
# library API without side effects.
if __name__ == '__main__':
  main()