编写一个简单的程序可以在我的计算机上找到完全重复的文件,但速度有点慢。有什么办法可以加快速度吗?

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Security.Cryptography;

namespace DupeFinder
{
    // Scans a folder tree, hashes every file's contents, and prints groups of
    // files whose digests (and therefore, almost certainly, contents) match.
    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("Getting files...");
            var files = Directory.GetFiles(@"D:\Photos", "*", SearchOption.AllDirectories);
            // Fix: the original used `new HMACMD5()`, whose parameterless constructor
            // generates a random key on every run — the keyed-HMAC work is pure
            // overhead here and the digests are not reproducible between runs.
            // A plain MD5 digest is sufficient (and faster) for duplicate detection.
            var alg = MD5.Create();
            var dict = new Dictionary<string, List<string>>();
            Console.WriteLine("Computing hashes...");
            int i = 0;
            int cursorLeft = Console.CursorLeft;
            int cursorTop = Console.CursorTop;
            foreach (var fileName in files)
            {
                // Rewrite the progress counter in place instead of scrolling.
                Console.SetCursorPosition(cursorLeft, cursorTop);
                Console.Write("Hashing file {0}/{1}", ++i, files.Length);
                using (var stream = new BufferedStream(File.OpenRead(fileName), 1024 * 1024 * 5))
                {
                    var hash = alg.ComputeHash(stream);
                    var str = BitConverter.ToString(hash);
                    // TryGetValue avoids the ContainsKey + indexer double lookup.
                    List<string> group;
                    if (!dict.TryGetValue(str, out group))
                    {
                        group = new List<string>();
                        dict[str] = group;
                    }
                    group.Add(fileName);
                }
            }
            Console.WriteLine();

            // Any digest shared by two or more files is a duplicate set.
            foreach (var dupe in dict.Where(p => p.Value.Count >= 2))
            {
                Console.WriteLine(string.Join(", ", dupe.Value));
            }

            Console.WriteLine("Done!");
            Console.ReadLine();
        }
    }
}

可能的优化:
  • 避免先将字节数组转换为字符串。我试过直接用 byte[] 作为字典键,但它不起作用,我猜是因为 byte[] 默认使用引用相等,而不是逐字节比较内容。
  • 更快的散列算法?
  • 不同的流,或缓冲区大小?我试过 1024^3,本以为那是一兆字节(其实 1024^2 才是一兆,1024^3 是 1 GB),结果反而似乎变慢了。

  • 或者这只是一个本质上很慢的事情?

    我记得 Dictionaries 可以接受 IEqualityComparer ,所以我可以编写自己的 byte[] 比较器。

    我在互联网上找到的很多算法倾向于先比较字节长度,我不需要这样做,因为我知道它总是 16 个字节。他们也倾向于一次比较 1 个字节……但我在 64 位机器上,那么为什么不做 8 个呢?
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Security.Cryptography;
    
    namespace DupeFinder
    {
        // Second iteration: keys the dictionary on the raw 16-byte digest (via a
        // custom IEqualityComparer) instead of a hex string, and writes the
        // duplicate groups to a text file instead of the console.
        class Program
        {
            static void Main(string[] args)
            {
                Console.WriteLine("Getting files...");
                string dir = @"D:\Photos";
                var files = Directory.GetFiles(dir, "*", SearchOption.AllDirectories);
                // Fix: plain MD5 instead of `new HMACMD5()` — the parameterless HMAC
                // constructor generates a random key each run, adding keyed-HMAC
                // overhead for no benefit and making digests non-reproducible.
                var alg = MD5.Create();
                var dict = new Dictionary<byte[], List<string>>(new Md5Comparer());
                Console.WriteLine("Computing hashes...");
                int i = 0;
                int cursorLeft = Console.CursorLeft;
                int cursorTop = Console.CursorTop;
                foreach (var fileName in files)
                {
                    // Rewrite the progress counter in place instead of scrolling.
                    Console.SetCursorPosition(cursorLeft, cursorTop);
                    Console.Write("Hashing file {0}/{1}", ++i, files.Length);
                    using (var stream = new BufferedStream(File.OpenRead(fileName), 1024 * 1024 * 5))
                    {
                        var hash = alg.ComputeHash(stream);
                        // TryGetValue avoids the ContainsKey + indexer double lookup.
                        List<string> group;
                        if (!dict.TryGetValue(hash, out group))
                        {
                            group = new List<string>();
                            dict[hash] = group;
                        }
                        group.Add(fileName);
                    }
                }
                Console.WriteLine();

                // Dump every digest shared by 2+ files as a numbered duplicate group.
                using (var sw = new StreamWriter(Path.Combine(dir, "duplicates.txt")))
                {
                    i = 0;
                    foreach (var dupe in dict.Where(p => p.Value.Count >= 2))
                    {
                        sw.WriteLine("Duplicate {0}", ++i);
                        foreach (var fn in dupe.Value)
                        {
                            sw.WriteLine("- {0}", fn);
                        }
                    }
                }

                Console.WriteLine("Done!");
                //Console.ReadLine();
            }
        }
    
        /// <summary>
        /// Equality comparer for 16-byte MD5 digests used as dictionary keys.
        /// Compares the digest as two 64-bit words rather than byte-by-byte.
        /// </summary>
        class Md5Comparer : IEqualityComparer<byte[]>
        {
            public bool Equals(byte[] x, byte[] y)
            {
                if (ReferenceEquals(x, y)) return true;
                if (x == null || y == null) return false;
                // Two 64-bit reads cover all 16 bytes of an MD5 digest.
                return BitConverter.ToInt64(x, 0) == BitConverter.ToInt64(y, 0)
                    && BitConverter.ToInt64(x, 8) == BitConverter.ToInt64(y, 8);
            }

            public int GetHashCode(byte[] obj)
            {
                // Bug fix: the original returned obj[0], yielding at most 256 distinct
                // hash codes — every dictionary operation then degrades into linear
                // Equals scans over huge buckets. The first four digest bytes give the
                // full 32-bit range (MD5 output is uniformly distributed).
                return BitConverter.ToInt32(obj, 0);
            }
        }
    }
    

    不知道这有多快......我没有做过任何基准测试,但它肯定看起来并没有变慢。

    新代码,感谢@spender:
    using System;
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Security.Cryptography;
    
    namespace DupeFinder
    {
        // Third iteration (per @spender's answer): pre-filter by file length so
        // only files that share a size with at least one other file get hashed.
        class Program
        {
            static void Main(string[] args)
            {
                var watch = Stopwatch.StartNew();
                const string dir = @"D:\Photos";
                var md5Comparer = new Md5Comparer();

                // Files of different lengths cannot be identical, so group by length
                // first and only hash within same-length groups of 2+ files. The
                // whole pipeline stays lazy until enumerated below.
                var dupeGroups = Directory.EnumerateFiles(dir, "*", SearchOption.AllDirectories)
                    .Select(fn => new FileInfo(fn))
                    .GroupBy(fi => fi.Length)
                    .Where(g => g.Count() > 1)
                    .SelectMany(g => g
                       .GroupBy(fi => GetHash(fi.FullName), md5Comparer)
                       .Where(g2 => g2.Count() > 1));

                // Write each set of same-digest files as a numbered group.
                using (var sw = new StreamWriter(Path.Combine(dir, "duplicates.txt")))
                {
                    int i = 0;
                    foreach (var dupeGroup in dupeGroups)
                    {
                        sw.WriteLine("Duplicate {0}", ++i);
                        foreach (FileInfo fi in dupeGroup)
                        {
                            sw.WriteLine("- {0}", fi.FullName);
                        }
                    }
                }

                Console.WriteLine("{0:0.000} seconds", watch.ElapsedMilliseconds / 1000d); // 22.068 seconds to process 10K files, 37 GB, 463 dupes
                Console.ReadLine();
            }

            // Fix: MD5.Create() instead of `new HMACMD5()` — the parameterless HMAC
            // constructor generates a random key each run, adding keyed-HMAC overhead
            // and making digests non-reproducible across runs.
            static readonly MD5 md5Hasher = MD5.Create();

            /// <summary>Computes the 16-byte MD5 digest of a file's contents.</summary>
            public static byte[] GetHash(string fileName)
            {
                using (var stream = File.OpenRead(fileName))
                    return md5Hasher.ComputeHash(stream);
            }
        }
    
        /// <summary>
        /// Equality comparer for 16-byte MD5 digests used as grouping keys.
        /// Compares the digest as two 64-bit words rather than byte-by-byte.
        /// </summary>
        class Md5Comparer : IEqualityComparer<byte[]>
        {
            public bool Equals(byte[] x, byte[] y)
            {
                if (ReferenceEquals(x, y)) return true;
                if (x == null || y == null) return false;
                // Two 64-bit reads cover all 16 bytes of an MD5 digest.
                return BitConverter.ToInt64(x, 0) == BitConverter.ToInt64(y, 0)
                    && BitConverter.ToInt64(x, 8) == BitConverter.ToInt64(y, 8);
            }

            public int GetHashCode(byte[] obj)
            {
                // Bug fix: the original returned obj[0], yielding at most 256 distinct
                // hash codes — GroupBy/Dictionary then degrade into linear Equals scans
                // over huge buckets. The first four digest bytes give the full 32-bit
                // range (MD5 output is uniformly distributed).
                return BitConverter.ToInt32(obj, 0);
            }
        }
    }
    

    从 360+ 秒减少到 22-70 秒。相当的改进!

    最佳答案

    你错过了一个巨大的优化:如果文件大小不匹配,两个文件就不可能相同,根本无需计算哈希……

    此外,匹配哈希并不能保证匹配内容。

    编辑

    好的。先按文件大小找出候选的重复文件并不难,然后您可以再检查它们是否真的是重复的:

    例如:

    // Enumerate every file under the folder, then group by file length.
    // Only groups with more than one member can possibly contain duplicates,
    // so everything else is discarded before any hashing happens.
    var files =
        Directory
            .GetFiles(
                @"C:\Users\Administrator\Downloads",
                "*",
                SearchOption.AllDirectories);
    var fileGroups=
        files
            .Select(fn => new FileInfo(fn))
            .GroupBy(fi => fi.Length)
            .Where(g => g.Count()>1);
    

    您可以通过为给定的文件名创建散列函数来更进一步
    int GetHash(fileName)  //fill in your definition
    

    然后...
    // Within each same-size group, sub-group by content hash; any sub-group
    // that still has more than one member is a likely set of duplicates.
    var fg2 =
            fileGroups
                .SelectMany(
                    g => g
                        .GroupBy(fi => GetHash(fi.FullName))
                        .Where(gg=>gg.Count()>1));
    
    foreach(var dupeGroup in fg2)
    {
        //everything in this group is probably duplicate
        //although you'd need to do byte by byte to be sure
        foreach(FileInfo dupe in dupeGroup)
        {
            // handle each duplicate here (report it, delete it, etc.)
        }
    }
    

    通过这样做,您将大大减少所需的散列量,因为您已按大小对候选对象进行了预过滤。

    关于c# - 快速校验和散列?,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/7936603/

    10-12 01:30
    查看更多