Update dataset splitting
parent 30eeff4b1d
commit cc8f070aaa
@@ -1,3 +1,4 @@
from concurrent.futures import ThreadPoolExecutor
import random
import cv2
import numpy as np
@@ -106,15 +107,15 @@ def process_images(input_folder, background_image_path, output_base):
    Recursively process all subfolders while preserving the directory structure
    """
    # Preprocess background paths (only needs to run once)
    if os.path.isfile(background_image_path):
        background_paths = [background_image_path]
    else:
        valid_ext = ['.jpg', '.jpeg', '.png', '.bmp', '.webp']
        background_paths = [
            os.path.join(background_image_path, f)
            for f in os.listdir(background_image_path)
            if os.path.splitext(f)[1].lower() in valid_ext
        ]
    # if os.path.isfile(background_image_path):
    #     background_paths = [background_image_path]
    # else:
    #     valid_ext = ['.jpg', '.jpeg', '.png', '.bmp', '.webp']
    #     background_paths = [
    #         os.path.join(background_image_path, f)
    #         for f in os.listdir(background_image_path)
    #         if os.path.splitext(f)[1].lower() in valid_ext
    #     ]

    # Recursively walk the input directory
    for root, dirs, files in os.walk(input_folder):
@@ -136,10 +137,10 @@ def process_images(input_folder, background_image_path, output_base):

            try:
                # Remove the background
                foreground = remove_background(input_path)
                result = remove_background(input_path)


                result = edge_fill2(foreground)
                # result = edge_fill2(result)

                # Save the result
                cv2.imwrite(output_path, result)
@@ -148,10 +149,53 @@ def process_images(input_folder, background_image_path, output_base):
            except Exception as e:
                print(f"Error processing {input_path}: {str(e)}")

def process_single_file(input_path, output_path):
    """Standalone helper that processes a single file."""
    try:
        result = remove_background(input_path)
        # result = edge_fill2(result)  # enable as needed
        cv2.imwrite(output_path, result)
        print(f"Processed: {input_path} -> {output_path}")
    except Exception as e:
        print(f"Error processing {input_path}: {str(e)}")

def process_imageswithpool(input_folder, background_image_path, output_base):
    """
    Multithreaded version of the processing function.
    Uses a ThreadPoolExecutor to process files in parallel.
    """
    with ThreadPoolExecutor(max_workers=os.cpu_count()*2) as executor:
        futures = []
        for root, dirs, files in os.walk(input_folder):
            # Create the output directory (the main thread guarantees creation order)
            relative_path = os.path.relpath(root, input_folder)
            output_dir = os.path.join(output_base, relative_path)
            os.makedirs(output_dir, exist_ok=True)

            # Submit tasks to the thread pool
            for filename in files:
                if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp')):
                    continue

                input_path = os.path.join(root, filename)
                output_path = os.path.join(output_dir, filename)
                futures.append(executor.submit(
                    process_single_file,
                    input_path,
                    output_path
                ))

        # Optional: wait for all tasks to complete and handle exceptions
        for future in futures:
            try:
                future.result()
            except Exception as e:
                print(f"Unhandled error in thread: {str(e)}")


# Usage example
input_directory = 'L:/Tobacco/2023_JY/20230821/SOURCE'
input_directory = 'L:/Grade_datasets/JY_A'
background_image_path = 'F:/dataset/02.TA_EC/rundata/BACKGROUND/ZY_B'
output_directory = 'L:/Test'
output_directory = 'L:/Grade_datasets/MOVE_BACKGROUND'

process_images(input_directory, background_image_path, output_directory)
process_imageswithpool(input_directory, background_image_path, output_directory)
dataset/splitdataset.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import os
import random
import shutil

def split_dataset(input_folder, output_folder):
    # Create the output subdirectories
    subdatasets = [os.path.join(output_folder, f'subdataset_{i+1}') for i in range(3)]
    for subdataset in subdatasets:
        os.makedirs(subdataset, exist_ok=True)

    # Iterate over each category folder
    for root, dirs, _ in os.walk(input_folder):
        for category in dirs:
            category_folder = os.path.join(root, category)
            images = [os.path.join(category_folder, f) for f in os.listdir(category_folder)
                      if f.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))]

            # If a category has fewer than 1200 images, pad it by resampling existing ones
            if len(images) < 1200:
                images.extend(random.choices(images, k=1200 - len(images)))

            # Shuffle the image order
            random.shuffle(images)

            # Create a category subfolder in each subdataset
            for subdataset_path in subdatasets:
                category_subfolder = os.path.join(subdataset_path, category)
                os.makedirs(category_subfolder, exist_ok=True)

            # Distribute the images across the subdatasets, 400 per subdataset
            # (the index assumes at most 1200 images per category)
            for i, image_path in enumerate(images):
                subdataset_index = i // 400
                subdataset_path = subdatasets[subdataset_index]
                category_subfolder = os.path.join(subdataset_path, category)
                shutil.copy(image_path, category_subfolder)

    print(f'Dataset split into 3 subdatasets with 400 images per category at {output_folder}')

if __name__ == "__main__":
    input_folder = 'L:/Grade_datasets/MOVE_BACKGROUND'
    output_folder = 'L:/Grade_datasets/SPLIT'
    split_dataset(input_folder, output_folder)