"""Dataset preparation helpers: threaded background removal and dataset splitting.

``process_single_file`` and ``process_imageswithpool`` come from
``dataset/recover.py``; they call ``remove_background``, which is defined in
that file outside the portion shown here and must be in scope at call time.
``split_dataset`` comes from ``dataset/splitdataset.py``.
"""
from concurrent.futures import ThreadPoolExecutor
import os
import random
import shutil

# Lower-case extensions accepted by the background-removal pass.
_RECOVER_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp')
# Lower-case extensions accepted by the dataset splitter.
_SPLIT_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')


def process_single_file(input_path, output_path):
    """Remove the background of one image and write the result to disk.

    Failures are reported on stdout instead of being raised, so a single bad
    file does not abort a batch run.

    Args:
        input_path: Path of the image to process.
        output_path: Path the processed image is written to.
    """
    try:
        # Imported lazily so the module can be imported without OpenCV.
        import cv2

        result = remove_background(input_path)
        # cv2.imwrite reports a failed write by returning False, not by
        # raising, so the return value has to be checked explicitly.
        if not cv2.imwrite(output_path, result):
            raise IOError(f"cv2.imwrite could not write {output_path}")
        print(f"Processed: {input_path} -> {output_path}")
    except Exception as e:
        print(f"Error processing {input_path}: {str(e)}")


def process_imageswithpool(input_folder, background_image_path, output_base,
                           max_workers=None):
    """Process every image under ``input_folder`` with a thread pool.

    The directory layout of ``input_folder`` is mirrored under
    ``output_base``; each image is handed to ``process_single_file``.

    Args:
        input_folder: Root directory that is walked recursively.
        background_image_path: Unused; kept so the signature matches
            ``process_images``.
        output_base: Root directory for the processed images.
        max_workers: Number of worker threads. Defaults to twice the CPU
            count, falling back to 2 when the CPU count is unknown.
    """
    if max_workers is None:
        # os.cpu_count() may return None, which cannot be multiplied.
        max_workers = (os.cpu_count() or 1) * 2

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for root, _dirs, files in os.walk(input_folder):
            # Output directories are created on the submitting thread so
            # they exist before any worker writes into them.
            relative_path = os.path.relpath(root, input_folder)
            output_dir = os.path.join(output_base, relative_path)
            os.makedirs(output_dir, exist_ok=True)

            for filename in files:
                if not filename.lower().endswith(_RECOVER_EXTENSIONS):
                    continue

                input_path = os.path.join(root, filename)
                output_path = os.path.join(output_dir, filename)
                futures.append(executor.submit(
                    process_single_file,
                    input_path,
                    output_path,
                ))

        # Wait for every task and surface anything the worker did not catch.
        for future in futures:
            try:
                future.result()
            except Exception as e:
                print(f"Unhandled error in thread: {str(e)}")


def _unique_destination(folder, filename, taken):
    """Return a path in ``folder`` for ``filename`` not handed out before.

    ``taken`` maps every path already handed out to the last ``_dupN`` index
    tried for it; it is updated in place. Repeated requests for the same name
    yield ``name_dup1.ext``, ``name_dup2.ext`` and so on.
    """
    base = os.path.join(folder, filename)
    if base not in taken:
        taken[base] = 0
        return base

    stem, ext = os.path.splitext(filename)
    index = taken[base]
    while True:
        index += 1
        candidate = os.path.join(folder, f'{stem}_dup{index}{ext}')
        if candidate not in taken:
            break
    taken[base] = index
    taken[candidate] = 0
    return candidate


def split_dataset(input_folder, output_folder, num_subsets=3,
                  images_per_subset=400):
    """Split every category folder into equally sized sub-datasets.

    Each directory found under ``input_folder`` is treated as a category named
    after the directory. Its images are randomly over-sampled (when too few)
    or randomly sub-sampled (when too many) to exactly
    ``num_subsets * images_per_subset`` entries, shuffled, and copied to
    ``output_folder/subdataset_<k>/<category>``.

    Args:
        input_folder: Root directory containing the category folders.
        output_folder: Directory that receives the ``subdataset_<k>`` folders.
        num_subsets: Number of sub-datasets to create.
        images_per_subset: Images per category in each sub-dataset.

    Raises:
        ValueError: If ``num_subsets`` or ``images_per_subset`` is below 1.
    """
    if num_subsets < 1 or images_per_subset < 1:
        raise ValueError('num_subsets and images_per_subset must be >= 1')

    required = num_subsets * images_per_subset

    # Create the output sub-dataset directories.
    subdatasets = [os.path.join(output_folder, f'subdataset_{i+1}')
                   for i in range(num_subsets)]
    for subdataset in subdatasets:
        os.makedirs(subdataset, exist_ok=True)

    # Shared across categories so that two same-named categories (or
    # over-sampled duplicates) never overwrite each other's files.
    taken = {}

    # Visit every category folder.
    for root, dirs, _ in os.walk(input_folder):
        for category in dirs:
            category_folder = os.path.join(root, category)
            images = [os.path.join(category_folder, f)
                      for f in os.listdir(category_folder)
                      if f.lower().endswith(_SPLIT_EXTENSIONS)]

            # random.choices raises IndexError on an empty population, so
            # folders without images (e.g. pure parent folders) are skipped.
            if not images:
                print(f'Skipping {category_folder}: no images found')
                continue

            if len(images) < required:
                # Too few images: pad with random repeats.
                images.extend(random.choices(images, k=required - len(images)))
            elif len(images) > required:
                # Too many images: keep a random subset so that the
                # sub-dataset index below never runs past the last one.
                images = random.sample(images, required)

            random.shuffle(images)

            # Create the category folder inside every sub-dataset.
            for subdataset_path in subdatasets:
                os.makedirs(os.path.join(subdataset_path, category),
                            exist_ok=True)

            # Hand out the images, images_per_subset per sub-dataset.
            for i, image_path in enumerate(images):
                category_subfolder = os.path.join(
                    subdatasets[i // images_per_subset], category)
                destination = _unique_destination(
                    category_subfolder, os.path.basename(image_path), taken)
                shutil.copy(image_path, destination)

    print(f'Dataset split into {num_subsets} subdatasets with '
          f'{images_per_subset} images per category at {output_folder}')


if __name__ == "__main__":
    # Usage example from dataset/recover.py: strip the backgrounds.
    input_directory = 'L:/Grade_datasets/JY_A'
    background_image_path = 'F:/dataset/02.TA_EC/rundata/BACKGROUND/ZY_B'
    output_directory = 'L:/Grade_datasets/MOVE_BACKGROUND'

    process_imageswithpool(input_directory, background_image_path, output_directory)

    # Entry point from dataset/splitdataset.py: split the processed images.
    input_folder = 'L:/Grade_datasets/MOVE_BACKGROUND'
    output_folder = 'L:/Grade_datasets/SPLIT'
    split_dataset(input_folder, output_folder)