MergeAllManualRaters.py

# -*- coding: utf-8 -*-
"""
Created on Fri Nov 17 11:34:11 2023
@author: arefks
"""
import os
import glob
import pandas as pd
from PIL import Image
import numpy as np
import concurrent.futures
# Step 1: Define the starting path and the file patterns for the reference
# images and for each rater's copies
start_path = r"C:\Users\aswen\Desktop\Code\Validation3"
file_pattern_all_images = os.path.join(start_path, "*", "manual_slice_inspection", "*.png")
manual_slice_inspection_image_files = glob.glob(file_pattern_all_images)
file_pattern_all_raters = os.path.join(start_path, "*", "validation*", "*.png")
validators_image_files = glob.glob(file_pattern_all_raters)
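# The patterns above assume a directory layout along these lines (hypothetical
# example; the actual folder and file names may differ):
#   Validation3/<dataset>/manual_slice_inspection/<sequence>_<slice>.png   <- reference images
#   Validation3/<dataset>/validation_<rater>/<sequence>_<slice>.png        <- one folder per rater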
# Step 2: Create DataFrames that map each image to its dataset and rater folder
column_names = ["Path", "dataset_name", "validator_name"]

def path_to_row(file_path):
    # The dataset name is the grandparent folder of each PNG; the parent folder
    # is either "manual_slice_inspection" or a rater's validation folder
    parts = file_path.split(os.sep)
    return [
        file_path,
        parts[-3] if len(parts) >= 3 else None,
        parts[-2] if len(parts) >= 2 else None,
    ]

manual_slice_inspection_df = pd.DataFrame(
    [path_to_row(f) for f in manual_slice_inspection_image_files],
    columns=column_names,
)
validators_df = pd.DataFrame(
    [path_to_row(f) for f in validators_image_files],
    columns=column_names,
)
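# Each DataFrame now holds one row per PNG, e.g. (illustrative values only):
#   Path=".../ds1/validation_rater1/T2w_slice05.png",
#   dataset_name="ds1", validator_name="validation_rater1"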
# Get the unique dataset names
unique_datasets = manual_slice_inspection_df["dataset_name"].unique()

def process_dataset(dataset_name):
    """Compare every reference image of a dataset against every rater image."""
    ma_subset = manual_slice_inspection_df[manual_slice_inspection_df["dataset_name"] == dataset_name]
    va_subset = validators_df[validators_df["dataset_name"] == dataset_name]
    results = []
    for _, ma_row in ma_subset.iterrows():
        ma_path = ma_row["Path"]
        # Cast to a signed dtype so the pixel-wise difference below cannot
        # wrap around the way uint8 arithmetic would
        ma_image_array = np.array(Image.open(ma_path).convert("RGB"), dtype=np.int16)
        result_row = {"Path": ma_path}
        is_same_all = []
        for _, va_row in va_subset.iterrows():
            va_path = va_row["Path"]
            va_image_array = np.array(Image.open(va_path).convert("RGB"), dtype=np.int16)
            # Two images count as identical only when they have the same shape
            # and every pixel matches exactly
            is_same = (
                ma_image_array.shape == va_image_array.shape
                and not np.abs(ma_image_array - va_image_array).any()
            )
            is_same_all.append(is_same)
            result_row[va_row["validator_name"]] = is_same
        result_row["dataset_name"] = dataset_name
        # The sequence type (e.g. "T2w") is encoded as the file-name prefix
        result_row["SequenceType"] = os.path.basename(ma_path).split("_")[0]
        # A rater may hold several images per dataset; mark the rater True if
        # *any* of their images matched this reference image exactly
        matching_validators = va_subset.loc[is_same_all, "validator_name"]
        for v in matching_validators:
            result_row[v] = True
        results.append(result_row)
    print("finished: " + dataset_name)
    return results
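# A single result row then looks roughly like this (illustrative names and
# values; the real rater columns come from the validation folder names):
#   {"Path": ".../T2w_slice05.png", "validation_rater1": True,
#    "validation_rater2": False, "dataset_name": "ds1", "SequenceType": "T2w"}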
# Get the number of available CPUs
num_cpus = os.cpu_count()

# Process the datasets in parallel using a thread pool
with concurrent.futures.ThreadPoolExecutor(max_workers=num_cpus) as executor:
    all_results = list(executor.map(process_dataset, unique_datasets))
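# Note: threads mostly overlap the file I/O of loading images; if the numeric
# comparison turns out to dominate, a process pool is a possible drop-in
# alternative (sketch only; on Windows it must run under an
# `if __name__ == "__main__":` guard so worker processes can re-import safely):
#   with concurrent.futures.ProcessPoolExecutor(max_workers=num_cpus) as executor:
#       all_results = list(executor.map(process_dataset, unique_datasets))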
# Concatenate the per-dataset results into the final DataFrame
result_df = pd.concat([pd.DataFrame(results) for results in all_results], ignore_index=True)

# Rater columns that do not occur in a given dataset are NaN; treat them as False
result_df = result_df.fillna(False)

# Save the combined DataFrame to a CSV file
output_path = r"C:\Users\aswen\Desktop\Code\AIDAqc_Figures\input"
result_df.to_csv(os.path.join(output_path, "combined_Human_Voters_from_votings3_Final.csv"), index=False)
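# Optional sanity check (illustrative; the rater column names depend on the
# validation folders actually found above):
#   df = pd.read_csv(os.path.join(output_path, "combined_Human_Voters_from_votings3_Final.csv"))
#   print(df.groupby("dataset_name").mean(numeric_only=True))  # per-rater match rate per dataset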