I wrote a simple program to find exact duplicate files on my computer, but it's a bit slow. Is there any way to speed it up?
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Security.Cryptography;

namespace DupeFinder
{
    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("Getting files...");
            var files = Directory.GetFiles(@"D:\Photos", "*", SearchOption.AllDirectories);
            var alg = new HMACMD5();
            var dict = new Dictionary<string, List<string>>();

            Console.WriteLine("Computing hashes...");
            int i = 0;
            int cursorLeft = Console.CursorLeft;
            int cursorTop = Console.CursorTop;
            foreach (var fileName in files)
            {
                Console.SetCursorPosition(cursorLeft, cursorTop);
                Console.Write("Hashing file {0}/{1}", ++i, files.Length);
                using (var stream = new BufferedStream(File.OpenRead(fileName), 1024 * 1024 * 5))
                {
                    var hash = alg.ComputeHash(stream);
                    var str = BitConverter.ToString(hash);
                    if (!dict.ContainsKey(str)) dict[str] = new List<string>();
                    dict[str].Add(fileName);
                }
            }
            Console.WriteLine();

            foreach (var dupe in dict.Where(p => p.Value.Count >= 2))
            {
                Console.WriteLine(string.Join(", ", dupe.Value));
            }
            Console.WriteLine("Done!");
            Console.ReadLine();
        }
    }
}
Are there any possible optimizations here, or is this just an inherently slow thing to do?
I remembered that Dictionaries can take an IEqualityComparer, so I can write my own byte[] comparer. Most of the algorithms I found on the internet tend to compare the byte lengths first, which I don't need to do because I know the hash is always 16 bytes. They also tend to compare one byte at a time... but I'm on a 64-bit machine, so why not compare 8 at a time?
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Security.Cryptography;

namespace DupeFinder
{
    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("Getting files...");
            string dir = @"D:\Photos";
            var files = Directory.GetFiles(dir, "*", SearchOption.AllDirectories);
            var alg = new HMACMD5();
            var dict = new Dictionary<byte[], List<string>>(new Md5Comparer());

            Console.WriteLine("Computing hashes...");
            int i = 0;
            int cursorLeft = Console.CursorLeft;
            int cursorTop = Console.CursorTop;
            foreach (var fileName in files)
            {
                Console.SetCursorPosition(cursorLeft, cursorTop);
                Console.Write("Hashing file {0}/{1}", ++i, files.Length);
                using (var stream = new BufferedStream(File.OpenRead(fileName), 1024 * 1024 * 5))
                {
                    var hash = alg.ComputeHash(stream);
                    if (!dict.ContainsKey(hash)) dict[hash] = new List<string>();
                    dict[hash].Add(fileName);
                }
            }
            Console.WriteLine();

            using (var sw = new StreamWriter(Path.Combine(dir, "duplicates.txt")))
            {
                i = 0;
                foreach (var dupe in dict.Where(p => p.Value.Count >= 2))
                {
                    sw.WriteLine("Duplicate {0}", ++i);
                    foreach (var fn in dupe.Value)
                    {
                        sw.WriteLine("- {0}", fn);
                    }
                }
            }
            Console.WriteLine("Done!");
            //Console.ReadLine();
        }
    }

    class Md5Comparer : IEqualityComparer<byte[]>
    {
        public bool Equals(byte[] x, byte[] y)
        {
            var xi = BitConverter.ToInt64(x, 0);
            var yi = BitConverter.ToInt64(y, 0);
            if (xi != yi) return false;
            xi = BitConverter.ToInt64(x, 8);
            yi = BitConverter.ToInt64(y, 8);
            return xi == yi;
        }

        public int GetHashCode(byte[] obj)
        {
            return obj[0];
        }
    }
}
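A side note on the comparer above (my own observation, not something from the original post): GetHashCode uses only the first byte of the digest, which gives the dictionary at most 256 distinct buckets. A minimal variation, assuming every key really is a 16-byte MD5 digest, would be to fold the first four bytes into the hash code instead:

class Md5Comparer : IEqualityComparer<byte[]>
{
    public bool Equals(byte[] x, byte[] y)
    {
        // Same 8-bytes-at-a-time comparison as above.
        return BitConverter.ToInt64(x, 0) == BitConverter.ToInt64(y, 0)
            && BitConverter.ToInt64(x, 8) == BitConverter.ToInt64(y, 8);
    }

    public int GetHashCode(byte[] obj)
    {
        // Use the first 4 bytes of the digest rather than just the first
        // byte, so keys spread across far more hash buckets.
        return BitConverter.ToInt32(obj, 0);
    }
}

With a few thousand files either version is fast, but the wider hash code keeps the bucket chains short as the file count grows.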
No idea how much faster this is... I haven't done any benchmarking, but it certainly doesn't seem any slower.
New code, thanks to @spender:
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Security.Cryptography;

namespace DupeFinder
{
    class Program
    {
        static void Main(string[] args)
        {
            var watch = Stopwatch.StartNew();
            const string dir = @"D:\Photos";
            var md5Comparer = new Md5Comparer();

            var dupeGroups = Directory.EnumerateFiles(dir, "*", SearchOption.AllDirectories)
                .Select(fn => new FileInfo(fn))
                .GroupBy(fi => fi.Length)
                .Where(g => g.Count() > 1)
                .SelectMany(g => g
                    .GroupBy(fi => GetHash(fi.FullName), md5Comparer)
                    .Where(g2 => g2.Count() > 1));

            using (var sw = new StreamWriter(Path.Combine(dir, "duplicates.txt")))
            {
                int i = 0;
                foreach (var dupeGroup in dupeGroups)
                {
                    sw.WriteLine("Duplicate {0}", ++i);
                    foreach (FileInfo fi in dupeGroup)
                    {
                        sw.WriteLine("- {0}", fi.FullName);
                    }
                }
            }

            Console.WriteLine("{0:0.000} seconds", watch.ElapsedMilliseconds / 1000d); // 22.068 seconds to process 10K files, 37 GB, 463 dupes
            Console.ReadLine();
        }

        static readonly HMACMD5 md5Hasher = new HMACMD5();

        public static byte[] GetHash(string fileName)
        {
            using (var stream = File.OpenRead(fileName))
                return md5Hasher.ComputeHash(stream);
        }
    }

    class Md5Comparer : IEqualityComparer<byte[]>
    {
        public bool Equals(byte[] x, byte[] y)
        {
            var xi = BitConverter.ToInt64(x, 0);
            var yi = BitConverter.ToInt64(y, 0);
            if (xi != yi) return false;
            xi = BitConverter.ToInt64(x, 8);
            yi = BitConverter.ToInt64(y, 8);
            return xi == yi;
        }

        public int GetHashCode(byte[] obj)
        {
            return obj[0];
        }
    }
}
Down from 360+ seconds to 22-70 seconds. Quite an improvement!
Best Answer
You're missing a huge optimization: if the file sizes don't match, the files can't be identical, so there's no need to hash them at all.
Also, a matching hash doesn't guarantee matching contents.
Edit
OK. Finding size-based dupes isn't hard, and then you can check whether they really are dupes:
For example:
var files =
    Directory
        .GetFiles(
            @"C:\Users\Administrator\Downloads",
            "*",
            SearchOption.AllDirectories);

var fileGroups =
    files
        .Select(fn => new FileInfo(fn))
        .GroupBy(fi => fi.Length)
        .Where(g => g.Count() > 1);
You can take this a step further by creating a hash function for a given file name:
int GetHash(string fileName) // fill in your definition
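One possible definition (my sketch, not from the original answer): hash the file contents with plain MD5, rather than the keyed HMACMD5 used in the question, and fold the 16-byte digest into an int to match the signature above. It assumes the same using directives as the question's code (System, System.IO, System.Security.Cryptography).

static int GetHash(string fileName)
{
    // Hash the file contents and fold the 16-byte MD5 digest into an int.
    // Different files can still collide, so a matching hash only marks a
    // group as probable duplicates.
    using (var md5 = MD5.Create())
    using (var stream = File.OpenRead(fileName))
    {
        byte[] digest = md5.ComputeHash(stream);
        return BitConverter.ToInt32(digest, 0);
    }
}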
Then...
var fg2 =
    fileGroups
        .SelectMany(
            g => g
                .GroupBy(fi => GetHash(fi.FullName))
                .Where(gg => gg.Count() > 1));

foreach (var dupeGroup in fg2)
{
    //everything in this group is probably duplicate
    //although you'd need to do byte by byte to be sure
    foreach (FileInfo dupe in dupeGroup)
    {
    }
}
By doing this, you will drastically reduce the amount of hashing required, because you've already pre-filtered the candidates by size.
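As the comment in the loop above says, a matching hash only makes a group of files probable duplicates; to be certain you would still compare them byte by byte. A minimal sketch of such a check (my addition, not part of the original answer; it assumes using System.IO;):

static bool FilesAreEqual(string path1, string path2)
{
    using (var s1 = File.OpenRead(path1))
    using (var s2 = File.OpenRead(path2))
    {
        // Cheap early exit, although size-based grouping should already
        // have filtered out files of different lengths.
        if (s1.Length != s2.Length) return false;

        var buf1 = new byte[64 * 1024];
        var buf2 = new byte[64 * 1024];
        int read1;
        while ((read1 = s1.Read(buf1, 0, buf1.Length)) > 0)
        {
            // FileStream.Read may return fewer bytes than requested, so
            // keep reading until the second buffer holds just as many.
            int read2 = 0;
            while (read2 < read1)
            {
                int n = s2.Read(buf2, read2, read1 - read2);
                if (n == 0) return false;
                read2 += n;
            }
            for (int i = 0; i < read1; i++)
                if (buf1[i] != buf2[i]) return false;
        }
        return true;
    }
}

Only files that already share both a size group and a hash group ever reach this check, so the extra I/O is limited to genuine candidates.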
Regarding "c# - Fast checksum hashing?", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/7936603/