所以,我发现了这个问题:
How to assign a Git SHA1's to a file without Git?
但我不确定如何对目录执行此方法。如何在不使用git的情况下哈希程序中的目录,使其与git给定的sha1相匹配?
最佳答案
事实证明,这比我预期的要难,但是我现在确实已经让它正常工作了。
正如我在评论中所说、以及hobbs在回答中所述,计算树哈希值并非易事。您必须对每个子树中的每个文件进行哈希处理,计算这些子树的哈希,然后使用这些哈希来计算顶级树的哈希。
所附的python代码似乎至少在某些测试案例中有效(例如,为git源本身计算树哈希)。我在评论中对我在此过程中发现的一些意外情况进行了解释。
现在也在my github "scripts" repository中。
[编辑:github版本现在已经针对Python3进行了修复,并且通常可能更新/更好。]
#! /usr/bin/env python
"""
Compute git hash values.
This is meant to work with both Python2 and Python3, but
has only been tested with Python2.7.
"""
from __future__ import print_function
import argparse
import os
import stat
import sys
from hashlib import sha1
def strmode(mode):
    """
    Format a git mode string for display.

    Modes are carried around without leading zeros (a directory is
    '40000'); for printing they are right-justified in a six column
    field, filled on the left with '0'.  Strings that are already six
    or more characters long come back unchanged.
    """
    missing = 6 - len(mode)
    # A non-positive repeat count yields '', so long modes are untouched.
    return '0' * missing + mode
def classify(path):
    """
    Return the git classification of a path.

    The result is a tuple ``(mode, gitclass, size)``:

    * mode     -- git mode string: '120000' (symlink), '40000'
                  (directory; note there is no leading zero),
                  '100755' or '100644' (regular file).
    * gitclass -- git object type, 'blob' or 'tree'.
    * size     -- st_size from lstat; wanted for file blobs.

    Raises ValueError if the path is something git cannot store
    (socket, FIFO, device node, ...).  OSError from os.lstat
    propagates to the caller.
    """
    # lstat rather than stat/os.path.isdir: symbolic links must be
    # classified as links, not as whatever they point to, and we need
    # the permission bits of regular files anyway.
    st = os.lstat(path)
    fmt = st.st_mode
    if stat.S_ISLNK(fmt):
        return '120000', 'blob', st.st_size
    if stat.S_ISDIR(fmt):
        return '40000', 'tree', st.st_size
    if stat.S_ISREG(fmt):
        # Git normalizes permissions to 0755/0644 by looking at the
        # owner execute bit only, so group/other execute bits alone
        # do not make a file 100755.
        mode = '100755' if fmt & stat.S_IXUSR else '100644'
        return mode, 'blob', st.st_size
    raise ValueError('un-git-able file system entity %s' % path)
def blob_hash(stream, size):
    """
    Return (as hash instance) the git hash of a blob.

    stream -- binary file-like object positioned at the start of the
              blob contents.
    size   -- expected number of content bytes; git hashes it as part
              of the 'blob <size>\\0' header, so it must be known
              before any data is read.

    Raises ValueError if the stream does not yield exactly ``size``
    bytes (e.g. the file changed between lstat and read).  The caller
    uses hexdigest() or digest() on the result as needed.
    """
    hasher = sha1()
    hasher.update(('blob %d\0' % size).encode('ascii'))
    nread = 0
    while True:
        # Read just 64K at a time to be kind to runtime storage
        # requirements.
        data = stream.read(65536)
        # Test truthiness, not equality with '': a binary stream
        # signals EOF with b'', which never equals '' on Python 3.
        if not data:
            break
        nread += len(data)
        hasher.update(data)
    if nread != size:
        # Not every stream (e.g. io.BytesIO) has a .name attribute.
        name = getattr(stream, 'name', '<stream>')
        raise ValueError('%s: expected %u bytes, found %u bytes' %
                         (name, size, nread))
    return hasher
def symlink_hash(path):
    """
    Return (as hash instance) the git hash of a symlink.

    Git stores a symbolic link as a blob whose contents are the raw
    bytes of the link target.  Caller must use hexdigest() or
    digest() as needed on the result.  OSError from os.readlink
    propagates to the caller.
    """
    target = os.readlink(path)
    if not isinstance(target, bytes):
        # os.readlink gave us text, but git hashes the raw bytes read
        # from the inode.  os.fsencode reverses the surrogateescape
        # decoding Python 3 applied, so targets that are not valid
        # UTF-8 round-trip exactly.  Fall back to UTF-8 where
        # os.fsencode does not exist (Python 2 with a unicode path).
        fsencode = getattr(os, 'fsencode', None)
        if fsencode is not None:
            target = fsencode(target)
        else:
            target = target.encode('utf8')
    hasher = sha1()
    hasher.update(('blob %d\0' % len(target)).encode('ascii'))
    hasher.update(target)
    return hasher
def _entry_name_bytes(name):
    """Return a directory entry name as the raw bytes git would store."""
    if isinstance(name, bytes):
        return name
    # os.fsencode undoes Python 3's surrogateescape decoding, so names
    # that are not valid UTF-8 round-trip; UTF-8 is the fallback where
    # os.fsencode does not exist.
    fsencode = getattr(os, 'fsencode', None)
    return fsencode(name) if fsencode is not None else name.encode('utf8')


def tree_hash(path, args):
    """
    Return (as hash instance) the git hash of the tree rooted at path.

    path -- directory to hash.
    args -- namespace with ``keep_dot_git`` (do not skip '.git'
            entries), ``debug`` (print each entry as it is hashed)
            and ``depth`` (current nesting level, used only to indent
            debug output; restored on exit, even on error).

    We need to know all files and sub-trees.  Since order matters, we
    must walk the sub-trees and files in their natural (byte) order,
    so we cannot use os.walk.

    This is slightly defective in that it does not know about
    .gitignore files (we can't just read them since git retains files
    that are in the index, even if they would be ignored by a
    .gitignore directive).  We also do not (cannot) deal with
    submodules here.  NOTE: empty directories are hashed as empty
    trees, whereas git does not track them at all.

    Raises ValueError (from classify) for un-git-able entries and
    lets OSError propagate.
    """
    # The tree object encodes its size in its header, which requires
    # two passes: one to find the size and one to compute the hash.
    to_skip = ('.', '..') if args.keep_dot_git else ('.', '..', '.git')
    entries = []
    tsize = 0
    for entry in os.listdir(path):
        if entry in to_skip:
            continue
        fullpath = os.path.join(path, entry)
        mode, gitclass, esize = classify(fullpath)
        name = _entry_name_bytes(entry)
        # git stores each entry as mode<sp><entry-name>\0<20 digest bytes>
        tsize += len(mode) + 1 + len(name) + 1 + 20
        entries.append((fullpath, mode, gitclass, esize, name))

    # Git's cache sorts foo/bar before fooXbar but after foo-bar,
    # because it actually stores foo/bar as the literal string
    # "foo/bar" in the index, rather than using recursion.  That is,
    # a directory name should sort as if it ends with '/' rather than
    # with '\0'.  The key must stay bytes throughout (b'/', not '/')
    # so that comparison is by raw byte value.
    entries.sort(key=lambda e: e[4] + b'/' if e[1] == '40000' else e[4])

    hasher = sha1()
    hasher.update(('tree %d\0' % tsize).encode('ascii'))
    args.depth += 1
    try:
        for fullpath, mode, gitclass, esize, name in entries:
            sub_hash = generic_hash(fullpath, mode, esize, args)
            if args.debug:
                print('%s%s %s %s\t%s' % (
                    ' ' * args.depth,
                    mode.rjust(6, '0'), gitclass, sub_hash.hexdigest(),
                    name.decode('utf8', 'replace')))
            # Git stores the sub-object hash as 20 raw bytes rather
            # than 40 ASCII characters, which is why the hashers
            # return hash instances (so we can use .digest() here).
            # The format is <mode><sp><name>\0<raw-hash>.
            hasher.update(mode.encode('ascii') + b' ' + name + b'\0')
            hasher.update(sub_hash.digest())
    finally:
        args.depth -= 1
    return hasher
def generic_hash(path, mode, size, args):
    """
    Pick the right hasher for an object from its git mode.

    '40000' is hashed as a tree, '120000' as a symlink, and anything
    else is opened in binary mode and hashed as a blob of ``size``
    bytes.  Returns the hash instance produced by the chosen hasher.
    """
    if mode == '40000':
        return tree_hash(path, args)
    if mode == '120000':
        return symlink_hash(path)
    # Remaining modes (100644 / 100755) are regular files.
    with open(path, 'rb') as blob:
        return blob_hash(blob, size)
def main():
    """
    Parse arguments and invoke hashers.

    Prints one line per path given on the command line, then exits
    (via sys.exit) with status 0 if every path was hashed and 1 if
    any path failed.  A failure on one path is reported and does not
    stop the remaining paths from being processed.
    """
    parser = argparse.ArgumentParser('compute git hashes')
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-k', '--keep-dot-git', action='store_true')
    parser.add_argument('path', nargs='+')
    args = parser.parse_args()
    status = 0
    for path in args.path:
        # Depth only drives debug indentation; reset it per path so a
        # failure part-way through one tree cannot skew the next.
        args.depth = -1
        try:
            try:
                mode, gitclass, size = classify(path)
            except ValueError:
                print('%s: unhashable!' % path)
                status = 1
                continue
            hasher = generic_hash(path, mode, size, args)
            result = hasher.hexdigest()
            if args.debug:
                print('%s %s %s\t%s' % (mode.rjust(6, '0'), gitclass,
                                        result, path))
            else:
                print('%s: %s hash = %s' % (path, gitclass, result))
        except (EnvironmentError, ValueError) as err:
            # EnvironmentError covers OSError and, on Python 2, the
            # IOError raised by open().  ValueError is raised while
            # hashing when a directory contains an un-git-able entry
            # or a file's size changes under us.
            print(str(err))
            status = 1
    sys.exit(status)
if __name__ == '__main__':
    # Script entry point: run main(), turning Ctrl-C into a short
    # message on exit instead of a traceback.
    try:
        exit_arg = main()
    except KeyboardInterrupt:
        exit_arg = '\nInterrupted'
    sys.exit(exit_arg)
关于python - 在没有git的情况下分配目录的git SHA,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/36657399/