删除MD5值相同的文件

通过MD5值把重复的文件移到del文件夹,表格记录。

import os
import hashlib
import shutil
import csv

def calculate_md5(file_path, chunk_size=4096):
    """计算单个文件的 MD5 值"""
    md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5.update(chunk)
    return md5.hexdigest()

def dedup_and_log(src_dir, del_dir,
                  all_md5_csv, dup_md5_csv):
    """
    1. 扫描 src_dir,计算每个文件的 MD5,写入 all_md5_csv
    2. 找到出现次数 >1 的 MD5,把这一组文件全部
       记录到 dup_md5_csv,并把“第二份及以后”移到 del_dir
    """
    # 确保 del 目录存在
    os.makedirs(del_dir, exist_ok=True)

    md5_to_paths = {}        # {md5: [path1, path2, ...]}
    # ------- 第一次遍历:收集信息并写全量表 -------
    with open(all_md5_csv, "w", newline='', encoding="utf-8-sig") as f_all:
        all_writer = csv.writer(f_all)
        all_writer.writerow(["File Path", "MD5"])

        for root, _, files in os.walk(src_dir):
            for name in files:
                path = os.path.join(root, name)
                file_md5 = calculate_md5(path)
                all_writer.writerow([path, file_md5])

                md5_to_paths.setdefault(file_md5, []).append(path)

    # ------- 第二步:输出重复表 & 移动重复 -------
    with open(dup_md5_csv, "w", newline='', encoding="utf-8-sig") as f_dup:
        dup_writer = csv.writer(f_dup)
        dup_writer.writerow(["File Path", "MD5"])

        for md5, paths in md5_to_paths.items():
            if len(paths) > 1:                 # 这一组有重复
                # 把所有路径都写进重复表
                for path in paths:
                    dup_writer.writerow([path, md5])

                # 把第二份及以后移到 del 目录
                for path in paths[1:]:
                    dst = os.path.join(del_dir, os.path.basename(path))
                    # 若同名冲突,给文件名加序号
                    i = 1
                    base, ext = os.path.splitext(dst)
                    while os.path.exists(dst):
                        dst = f"{base}_{i}{ext}"
                        i += 1
                    shutil.move(path, dst)
                    print(f"Moved duplicate: {path}  ->  {dst}")

# ---------- 用法示例 ----------
source_directory      = r"f:\xz\dy"
delete_directory      = r"f:\xz\del"

all_md5_csv_path      = r"f:\xz\all_files_md5.csv"
duplicates_md5_csv    = r"f:\xz\duplicates_md5.csv"

dedup_and_log(source_directory,
              delete_directory,
              all_md5_csv_path,
              duplicates_md5_csv)

MD5值命名文件

import os
import shutil
import hashlib
 
def calculate_md5(file_path):
    """计算文件的 MD5 值"""
    md5_hash = hashlib.md5()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b""):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()
 
def resolve_conflict(file_path):
    """解决文件名冲突,生成唯一文件名"""
    counter = 1
    base, ext = os.path.splitext(file_path)
    new_file_path = file_path
 
    # 如果文件名冲突,添加数字后缀
    while os.path.exists(new_file_path):
        new_file_path = f"{base}_{counter}{ext}"
        counter += 1
 
    return new_file_path
 
def organize_files_with_structure(root_folder, modified_root, log_txt):
    """重命名文件,保持原始文件目录结构"""
    with open(log_txt, 'w', encoding='utf-8') as log_file:
        for dirpath, _, filenames in os.walk(root_folder):
            for filename in filenames:
                old_file_path = os.path.join(dirpath, filename)
 
                if os.path.isfile(old_file_path):
                    # 计算 MD5 值并生成新文件路径
                    md5_name = calculate_md5(old_file_path)
                    extension = os.path.splitext(filename)[1]
                    # 创建与原始目录结构相同的路径
                    relative_path = os.path.relpath(dirpath, root_folder)
                    new_dir = os.path.join(modified_root, relative_path)
                    os.makedirs(new_dir, exist_ok=True)
                    new_file_path = os.path.join(new_dir, md5_name + extension)
 
                    # 解决文件名冲突
                    new_file_path = resolve_conflict(new_file_path)
 
                    # 复制文件到新路径
                    shutil.copy(old_file_path, new_file_path)
 
                    # 写入日志
                    log_file.write(f"{old_file_path} -> {new_file_path}\n")
                    print(f"处理文件: {old_file_path} -> {new_file_path}")
 
if __name__ == "__main__":
    # 指定原始文件夹路径
    folder_path = r"D:\T1"
    # 指定修改后文件夹根目录(可以单独设置)
    modified_root = r"D:\T1_modified"
    # 指定日志文件路径
    log_file = os.path.join(modified_root, "file_log.txt")
 
    if os.path.exists(folder_path):
        organize_files_with_structure(folder_path, modified_root, log_file)
        print(f"处理完成!日志已保存到 {log_file}")
        print(f"修改后文件存放在: {modified_root}")
    else:
        print("指定的文件夹不存在,请检查路径。")
❤️ 转载文章请注明出处,谢谢!❤️