MergeAllManualRaters.py

# -*- coding: utf-8 -*-
"""
Created on Fri Nov 17 11:34:11 2023
@author: arefks
"""
import os
import glob
import pandas as pd
from PIL import Image
import numpy as np
import concurrent.futures
# Step 1: Define the starting path and the file patterns for the reference
# images and for each rater's copies
start_path = r"C:\Users\aswen\Desktop\Code\Validation3"
file_pattern_all_images = os.path.join(start_path, "*", "manual_slice_inspection", "*.png")
manual_slice_inspection_image_files = glob.glob(file_pattern_all_images)
file_pattern_all_raters = os.path.join(start_path, "*", "validation*", "*.png")
validators_image_files = glob.glob(file_pattern_all_raters)
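# The patterns above assume a directory layout along these lines (hypothetical
# example; the actual folder and file names may differ):
#   Validation3/<dataset>/manual_slice_inspection/<sequence>_<slice>.png   <- reference images
#   Validation3/<dataset>/validation_<rater>/<sequence>_<slice>.png        <- one folder per rater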
# Step 2: Create DataFrames that map each image to its dataset and rater folder
column_names = ["Path", "dataset_name", "validator_name"]

def path_to_row(file_path):
    # The dataset name is the grandparent folder of each PNG; the parent folder
    # is either "manual_slice_inspection" or a rater's validation folder
    parts = file_path.split(os.sep)
    return [
        file_path,
        parts[-3] if len(parts) >= 3 else None,
        parts[-2] if len(parts) >= 2 else None,
    ]

manual_slice_inspection_df = pd.DataFrame(
    [path_to_row(f) for f in manual_slice_inspection_image_files],
    columns=column_names,
)
validators_df = pd.DataFrame(
    [path_to_row(f) for f in validators_image_files],
    columns=column_names,
)
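# Each DataFrame now holds one row per PNG, e.g. (illustrative values only):
#   Path=".../ds1/validation_rater1/T2w_slice05.png",
#   dataset_name="ds1", validator_name="validation_rater1"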
# Get the unique dataset names
unique_datasets = manual_slice_inspection_df["dataset_name"].unique()

def process_dataset(dataset_name):
    """Compare every reference image of a dataset against every rater image."""
    ma_subset = manual_slice_inspection_df[manual_slice_inspection_df["dataset_name"] == dataset_name]
    va_subset = validators_df[validators_df["dataset_name"] == dataset_name]
    results = []
    for _, ma_row in ma_subset.iterrows():
        ma_path = ma_row["Path"]
        # Cast to a signed dtype so the pixel-wise difference below cannot
        # wrap around the way uint8 arithmetic would
        ma_image_array = np.array(Image.open(ma_path).convert("RGB"), dtype=np.int16)
        result_row = {"Path": ma_path}
        is_same_all = []
        for _, va_row in va_subset.iterrows():
            va_path = va_row["Path"]
            va_image_array = np.array(Image.open(va_path).convert("RGB"), dtype=np.int16)
            # Two images count as identical only when they have the same shape
            # and every pixel matches exactly
            is_same = (
                ma_image_array.shape == va_image_array.shape
                and not np.abs(ma_image_array - va_image_array).any()
            )
            is_same_all.append(is_same)
            result_row[va_row["validator_name"]] = is_same
        result_row["dataset_name"] = dataset_name
        # The sequence type (e.g. "T2w") is encoded as the file-name prefix
        result_row["SequenceType"] = os.path.basename(ma_path).split("_")[0]
        # A rater may hold several images per dataset; mark the rater True if
        # *any* of their images matched this reference image exactly
        matching_validators = va_subset.loc[is_same_all, "validator_name"]
        for v in matching_validators:
            result_row[v] = True
        results.append(result_row)
    print("finished: " + dataset_name)
    return results
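# A single result row then looks roughly like this (illustrative names and
# values; the real rater columns come from the validation folder names):
#   {"Path": ".../T2w_slice05.png", "validation_rater1": True,
#    "validation_rater2": False, "dataset_name": "ds1", "SequenceType": "T2w"}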
# Get the number of available CPUs
num_cpus = os.cpu_count()

# Process the datasets in parallel using a thread pool
with concurrent.futures.ThreadPoolExecutor(max_workers=num_cpus) as executor:
    all_results = list(executor.map(process_dataset, unique_datasets))
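# Note: threads mostly overlap the file I/O of loading images; if the numeric
# comparison turns out to dominate, a process pool is a possible drop-in
# alternative (sketch only; on Windows it must run under an
# `if __name__ == "__main__":` guard so worker processes can re-import safely):
#   with concurrent.futures.ProcessPoolExecutor(max_workers=num_cpus) as executor:
#       all_results = list(executor.map(process_dataset, unique_datasets))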
# Concatenate the per-dataset results into the final DataFrame
result_df = pd.concat([pd.DataFrame(results) for results in all_results], ignore_index=True)

# Rater columns that do not occur in a given dataset are NaN; treat them as False
result_df = result_df.fillna(False)

# Save the combined DataFrame to a CSV file
output_path = r"C:\Users\aswen\Desktop\Code\AIDAqc_Figures\input"
result_df.to_csv(os.path.join(output_path, "combined_Human_Voters_from_votings3_Final.csv"), index=False)
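# Optional sanity check (illustrative; the rater column names depend on the
# validation folders actually found above):
#   df = pd.read_csv(os.path.join(output_path, "combined_Human_Voters_from_votings3_Final.csv"))
#   print(df.groupby("dataset_name").mean(numeric_only=True))  # per-rater match rate per dataset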