import torch
import torchvision
import os
from torch.utils.data import DataLoader, Dataset, TensorDataset
from PIL import Image, ImageOps
import numpy as np
import shutil
import random
from pathlib import Path
import albumentations as A
import cv2
from scipy.stats import norm, skewnorm
import skimage
def list_files_in_folder(image_folder):
    """Return the names of the regular files located directly in ``image_folder``.

    Sub-directories are skipped and the search is not recursive. The names
    come back in the order produced by ``os.listdir``.
    """
    return [
        entry
        for entry in os.listdir(image_folder)
        if os.path.isfile(os.path.join(image_folder, entry))
    ]
def create_save_dir(direct, name_subdirectory):
    """Make sure ``direct/name_subdirectory`` exists and return its path.

    Args:
        direct: parent directory.
        name_subdirectory: name of the directory to create inside ``direct``.

    Returns:
        The joined path ``os.path.join(direct, name_subdirectory)``.

    Prints ``'make dir'`` when the path did not exist yet.
    """
    save_dir = os.path.join(direct, name_subdirectory)
    if not os.path.exists(save_dir):
        print('make dir')
        # makedirs also creates missing parent directories, and exist_ok
        # tolerates the directory appearing between the check and the call.
        os.makedirs(save_dir, exist_ok=True)
    return save_dir
# Load QMNIST and save its train and test splits to disk as JPEG images
root = './'
dataloader_train = torchvision.datasets.QMNIST(root, train=True, transform=None, target_transform=None, download=True)
dataloader_test = torchvision.datasets.QMNIST(root, train=False, transform=None, target_transform=None, download=True)

# File names follow the pattern <label>_<running index>_<split>.jpg.
# NOTE(review): both export loops run on every execution, whereas the
# validation split below only runs while QMNIST_valid is empty. Re-running the
# script would therefore write again, into QMNIST_test, the images that were
# already moved to QMNIST_valid - confirm the script is meant to run only once.
train_dir = create_save_dir('./', 'QMNIST_train')
for index, (image, target) in enumerate(dataloader_train):
    train_name = '_'.join([str(target), str(index), 'train']) + '.jpg'
    image.save(os.path.join(train_dir, train_name), "JPEG")

test_dir = create_save_dir('./', 'QMNIST_test')
for index, (image, target) in enumerate(dataloader_test):
    test_name = '_'.join([str(target), str(index), 'test']) + '.jpg'
    image.save(os.path.join(test_dir, test_name), "JPEG")

# Take 1/3 of the original QMNIST test set, per digit, for the validation set
folder = './QMNIST_test'
valid_dir = create_save_dir('./', 'QMNIST_valid')
name_list = list_files_in_folder(folder)
if not os.listdir(valid_dir):
    for digit in range(10):
        # Files of this digit, in the order they appear in name_list
        digit_files = [file_name for file_name in name_list if file_name[0] == str(digit)]
        to_move = np.around(len(digit_files) / 3)
        for count_moved, file_name in enumerate(digit_files):
            if count_moved >= to_move:
                break
            shutil.move(os.path.join(folder, file_name), os.path.join(valid_dir, file_name))
# Digit whose images act as key instances
key_instance_digit = '4'


# Create separate folder for key instances
def key_instances_to_folder(direct):
    """Move the key-instance images found in ``direct`` into a sub-folder.

    The sub-folder is named after the module-level ``key_instance_digit`` and
    is created inside ``direct`` if needed. Every file located directly in
    ``direct`` whose name starts with that digit is moved into it.

    Args:
        direct: directory holding the exported images.
    """
    # Normalise to text: the module-level value is rebound to an int further
    # down in this script, which would break both the path join and the
    # file-name comparison if this function were called afterwards.
    digit = str(key_instance_digit)
    key_ins_dir = create_save_dir(direct, digit)
    for file_name in list_files_in_folder(direct):
        if file_name.startswith(digit):
            shutil.move(os.path.join(direct, file_name), os.path.join(key_ins_dir, file_name))
# Separate the key-instance images in the train, validation and test folders
for split_dir in (train_dir, valid_dir, test_dir):
    key_instances_to_folder(split_dir)
# Red channel: skew-normal parameters, obtained with skewnorm.fit on a sample
a_est = 10.628
loc_est = 22.481
scale_est = 95.479
medianR = skewnorm.median(a_est, loc=loc_est, scale=scale_est)
Q025R = skewnorm.ppf(0.25, a_est, loc=loc_est, scale=scale_est)

# Green channel: mean and standard deviation, obtained with a normal fit on a sample
muG = 124.643
stdG = 39.103
# NOTE(review): skewnorm.ppf takes (q, a, loc, scale), so muG ends up as the
# shape parameter `a` and stdG as `loc`, although the channel was fitted with a
# normal distribution. The value is used elsewhere in this file as a half-width
# around muG; confirm whether norm.ppf(0.25, muG, stdG) was intended before
# changing it, because that would alter the sampled colours.
Q025G = skewnorm.ppf(0.25, muG, loc=stdG)

# Blue channel: mean and standard deviation, obtained with a normal fit on a sample
muB = 141.607
stdB = 32.383
# NOTE(review): same argument mapping as for the green channel (a=muB, loc=stdB).
Q025B = skewnorm.ppf(0.25, muB, loc=stdB)
# Augmentation pipeline applied to every generated instance image
_augmentation_steps = [
    A.Resize(height=80, width=80, interpolation=cv2.INTER_LINEAR, always_apply=False, p=1),
    A.Rotate(limit=90, p=1.0),
    A.GaussNoise(var_limit=(2.0, 5.0), p=0.1),
    A.HorizontalFlip(p=0.5),
    A.Blur(blur_limit=5, always_apply=False, p=0.15),
    A.RandomBrightnessContrast(p=0.8),
    A.MotionBlur(blur_limit=(3, 3), p=0.05),
    A.RandomFog(
        fog_coef_lower=0.3,
        fog_coef_upper=0.4,
        alpha_coef=0.08,
        always_apply=False,
        p=0.05,
    ),
]
transform = A.Compose(_augmentation_steps)
def _sample_channel_value(draw, center, half_width):
    """Call ``draw()`` until the value lies in [0, 255] and within ``center +/- half_width``."""
    value = -1  # deliberately out of range so that at least one draw happens
    while value < 0 or value > 255 or value > center + half_width or value < center - half_width:
        value = draw()
    return value


def auxiliary_function(count_bags, i_range_min, i_range_max, max_num_bags_of_type, dir_qmnist,
                       max_num_inst_of_type, img_names_list, save_f, bag_names):
    """Fill a sequence of bag folders with coloured, augmented digit images.

    For every index ``i`` in ``range(i_range_min, i_range_max)`` one image name
    is drawn at random (with replacement) from ``img_names_list``, read from
    ``dir_qmnist`` in grayscale, inverted, blended 50/50 with a uniformly
    coloured image, passed through the module-level ``transform`` and written
    to ``<save_f>/<bag name>/<image name>_<i zero-padded to 6 digits>.jpg``.
    Once the current bag holds ``max_num_inst_of_type[k]`` images the next bag
    is started; the loop ends as soon as ``count_bags`` exceeds
    ``max_num_bags_of_type``.

    The colour of each image is sampled per channel from the module-level fit
    parameters (``a_est``/``loc_est``/``scale_est``, ``muG``/``stdG``,
    ``muB``/``stdB``) and restricted with ``Q025R``/``Q025G``/``Q025B``.

    Args:
        count_bags: bag counter value to start from.
        i_range_min: first index used as file-name suffix.
        i_range_max: exclusive upper bound of the index range.
        max_num_bags_of_type: the loop stops once ``count_bags`` exceeds this.
        dir_qmnist: directory the source images are read from.
        max_num_inst_of_type: number of images to write per bag, indexable by
            the position of the bag in ``bag_names``.
        img_names_list: file names to sample from.
        save_f: directory in which the bag folders are created.
        bag_names: names of the bag folders.

    Returns:
        ``(last_used_index_in_list, count_bags)``: the last index for which an
        image was written (``i_range_min`` if none was written) and the
        updated bag counter.
    """
    count_instances = 0
    count_bags_local = 0
    # Bound up-front so the return statement works even when no image is written.
    last_used_index_in_list = i_range_min
    for i in range(i_range_min, i_range_max):
        if count_instances == max_num_inst_of_type[count_bags_local]:
            # Current bag is complete: move on to the next bag or stop.
            count_bags += 1
            count_bags_local += 1
            if count_bags > max_num_bags_of_type:
                break
            count_instances = 0
        random_name = random.choice(img_names_list)
        src = os.path.join(dir_qmnist, random_name)
        bag_folder = create_save_dir(save_f, str(bag_names[count_bags_local]))
        dst = os.path.join(bag_folder, random_name)
        image = cv2.imread(src, cv2.IMREAD_GRAYSCALE)
        image = cv2.bitwise_not(image)
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        # Rejection sampling, in the order red, green, blue
        sampled_Rchannel = _sample_channel_value(
            lambda: skewnorm.rvs(a_est, loc_est, scale_est, size=1), loc_est, Q025R)
        sampled_Gchannel = _sample_channel_value(
            lambda: norm.rvs(muG, stdG, size=1), muG, Q025G)
        sampled_Bchannel = _sample_channel_value(
            lambda: norm.rvs(muB, stdB, size=1), muB, Q025B)
        # Uniformly coloured canvas with the same shape and dtype as the digit
        # image; channel 0 receives the blue sample and channel 2 the red one.
        blank_image = np.zeros_like(image)
        blank_image[:, :, 0] = sampled_Bchannel
        blank_image[:, :, 1] = sampled_Gchannel
        blank_image[:, :, 2] = sampled_Rchannel
        blend_image = cv2.addWeighted(image, 0.5, np.asarray(blank_image), 0.5, 0.0)
        # Augment an image
        transformed = transform(image=blend_image)
        transformed_image = transformed["image"]
        cv2.imwrite(dst+'_'+str(i).zfill(6)+'.jpg', transformed_image)
        count_instances += 1
        last_used_index_in_list = i
    return last_used_index_in_list, count_bags
def num_key_ins(x, percent=None):
    """Return the number of key instances for ``x`` instances, rounded up.

    Args:
        x: number of instances in a bag; a scalar or a NumPy array.
        percent: percentage of key instances. When omitted, the module-level
            ``percent_key_instances`` is used.

    Returns:
        ``ceil(x * percent / 100)`` as returned by ``np.ceil``.
    """
    if percent is None:
        percent = percent_key_instances
    return np.ceil((x * percent) / 100)
def compose_dataset(dir_qmnist, save_subfolder, percent_key_instances,
                    key_instance_digit, num_instances_in_bag_negative, num_instances_in_bag_positive,
                    bag_names_negative, bag_names_positive):
    """Compose the positive and negative bags of one subset.

    Positive bags are written to ``<save_subfolder>/positive``. Each receives
    ``ceil(size * percent_key_instances / 100)`` images sampled from
    ``<dir_qmnist>/<key_instance_digit>`` and the rest of its size from the
    files located directly in ``dir_qmnist``. Negative bags are written to
    ``<save_subfolder>/negative`` and only receive images from ``dir_qmnist``.

    Args:
        dir_qmnist: directory with the exported images of the subset.
        save_subfolder: directory in which 'positive' and 'negative' are created.
        percent_key_instances: percentage of key instances in positive bags.
        key_instance_digit: name of the sub-folder holding the key instances.
        num_instances_in_bag_negative: number of images per negative bag.
        num_instances_in_bag_positive: number of images per positive bag.
        bag_names_negative: folder names of the negative bags.
        bag_names_positive: folder names of the positive bags.
    """
    save_f = create_save_dir(save_subfolder, 'positive')
    img_names_list = list_files_in_folder(dir_qmnist)
    random.shuffle(img_names_list)

    bag_sizes_positive = np.asarray(num_instances_in_bag_positive)
    # Computed from the percent_key_instances argument, so the function no
    # longer depends on a module-level variable of the same name.
    num_key_ins_per_bag = np.ceil((bag_sizes_positive * percent_key_instances) / 100)
    num_non_key_per_bag = bag_sizes_positive - num_key_ins_per_bag

    # auxiliary_function stops once its bag counter exceeds this limit. It is
    # derived from the number of positive bags, so positive and negative lists
    # of different lengths are handled as well.
    last_positive_bag = len(num_instances_in_bag_positive) - 1
    i_range_max = int(1e20)  # practically unbounded; the bag limit ends the loop

    # Positive bags: negative (non-key) instances first
    last_used_index_in_list, _ = auxiliary_function(
        0, 0, i_range_max, last_positive_bag, dir_qmnist,
        num_non_key_per_bag, img_names_list, save_f, bag_names_positive)

    # Positive bags: key instances
    key_instance_dir = os.path.join(dir_qmnist, str(key_instance_digit))
    key_instance_list = list_files_in_folder(key_instance_dir)
    _, count_bags_pos = auxiliary_function(
        0, 0, i_range_max, last_positive_bag, key_instance_dir,
        num_key_ins_per_bag, key_instance_list, save_f, bag_names_positive)

    # Negative bags: the bag counter continues after the positive bags and the
    # file index continues from the last index used for the positive bags
    save_f = create_save_dir(save_subfolder, 'negative')
    last_negative_bag = count_bags_pos + len(num_instances_in_bag_negative) - 1
    auxiliary_function(
        count_bags_pos, last_used_index_in_list, i_range_max, last_negative_bag,
        dir_qmnist, num_instances_in_bag_negative, img_names_list, save_f,
        bag_names_negative)
percent_key_instances = 10  # e.g. 5%, 10%, 20%: percent of key instances in positive bags
permutation = 1  # 1 permutation per fold used, meaning one sampling of images from QMNIST
key_instance_digit = int(key_instance_digit)
num_bags = 12  # Overall for training (num_bags/2 positive, num_bags/2 negative)

# Bag (patient) names and number of images per bag (patient)
bag_names_train_negative = ['59', '63', '80', '68', '71', '75']
bag_names_train_positive = ['01', '05', '53', '86', '88', '98']
bag_names_test_negative = ['26', '61', '73', '70']
bag_names_test_positive = ['07', '101', '37', '96']
bag_names_valid_negative = ['78', '65']
bag_names_valid_positive = ['55', '03']
num_instances_in_bag_train_negative = [26484, 1418, 22948, 26816, 10333, 14714]
num_instances_in_bag_train_positive = [15408, 3257, 3788, 12294, 3927, 4559]
num_instances_in_bag_test_negative = [17147, 15875, 7707, 6400]
num_instances_in_bag_test_positive = [6953, 8451, 11712, 2691]
num_instances_in_bag_valid_negative = [2170, 44231]
num_instances_in_bag_valid_positive = [1657, 36899]

fold = 1
fold_dir = create_save_dir('./', 'fold' + str(fold))
save_folder = create_save_dir(
    fold_dir,
    'PAPQMNIST_' + str(num_bags).zfill(4) + '__'
    + str(percent_key_instances).zfill(4) + '_' + str(permutation),
)

# Compose the train, validation and test sets, in that order
for subset, dir_qmnist, neg_counts, pos_counts, neg_names, pos_names in (
    ('train', './QMNIST_train/',
     num_instances_in_bag_train_negative, num_instances_in_bag_train_positive,
     bag_names_train_negative, bag_names_train_positive),
    ('valid', './QMNIST_valid/',
     num_instances_in_bag_valid_negative, num_instances_in_bag_valid_positive,
     bag_names_valid_negative, bag_names_valid_positive),
    ('test', './QMNIST_test/',
     num_instances_in_bag_test_negative, num_instances_in_bag_test_positive,
     bag_names_test_negative, bag_names_test_positive),
):
    save_subfolder = create_save_dir(save_folder, subset)
    compose_dataset(dir_qmnist, save_subfolder, percent_key_instances,
                    key_instance_digit, neg_counts, pos_counts,
                    neg_names, pos_names)
# Create the other folds by copying fold 1 and rearranging its bags/patients
# between the train, valid and test folders, as for OC data
fold = 1
fold_dir = create_save_dir('./', 'fold'+str(fold))
dataset_folder_name = ('PAPQMNIST'+'_'+str(num_bags).zfill(4)+'_'+
                       '_'+str(percent_key_instances).zfill(4)+'_'+str(permutation))
src = os.path.join(fold_dir, dataset_folder_name)

set_name = ['train', 'valid', 'test']
class_name = ['negative', 'positive']

# Bag arrangement per fold: fold number -> set -> class -> bag names.
# An explicit table replaces per-fold variables looked up through globals().
fold_bag_arrangement = {
    2: {
        'train': {'negative': ['80', '78', '65', '70', '61', '75'],
                  'positive': ['53', '55', '03', '96', '37', '98']},
        'valid': {'negative': ['26', '73'],
                  'positive': ['07', '101']},
        'test': {'negative': ['59', '63', '68', '71'],
                 'positive': ['01', '05', '88', '86']},
    },
    3: {
        'train': {'negative': ['70', '26', '73', '71', '63', '61'],
                  'positive': ['37', '07', '101', '86', '05', '96']},
        'valid': {'negative': ['68', '59'],
                  'positive': ['88', '01']},
        'test': {'negative': ['65', '80', '78', '75'],
                 'positive': ['03', '98', '55', '53']},
    },
    4: {
        'train': {'negative': ['80', '68', '71', '75', '73', '70'],
                  'positive': ['53', '86', '88', '55', '03', '37']},
        'valid': {'negative': ['65', '61'],
                  'positive': ['101', '96']},
        'test': {'negative': ['26', '59', '78', '63'],
                 'positive': ['07', '01', '98', '05']},
    },
    5: {
        'train': {'negative': ['59', '63', '78', '65', '61', '70'],
                  'positive': ['01', '05', '55', '03', '07', '37']},
        'valid': {'negative': ['26', '80'],
                  'positive': ['96', '98']},
        'test': {'negative': ['73', '71', '75', '68'],
                 'positive': ['101', '86', '53', '88']},
    },
    6: {
        'train': {'negative': ['63', '73', '68', '75', '78', '26'],
                  'positive': ['01', '05', '88', '98', '07', '101']},
        'valid': {'negative': ['59', '71'],
                  'positive': ['86', '53']},
        'test': {'negative': ['61', '80', '65', '70'],
                 'positive': ['37', '03', '55', '96']},
    },
    7: {
        'train': {'negative': ['80', '68', '78', '26', '73', '70'],
                  'positive': ['86', '98', '55', '07', '101', '96']},
        'valid': {'negative': ['71', '75'],
                  'positive': ['88', '53']},
        'test': {'negative': ['65', '59', '61', '63'],
                 'positive': ['03', '01', '37', '05']},
    },
    8: {
        'train': {'negative': ['59', '75', '78', '65', '26', '73'],
                  'positive': ['01', '53', '98', '03', '07', '101']},
        'valid': {'negative': ['61', '63'],
                  'positive': ['37', '05']},
        'test': {'negative': ['80', '71', '70', '68'],
                 'positive': ['55', '86', '96', '88']},
    },
    9: {
        'train': {'negative': ['59', '63', '68', '71', '65', '61'],
                  'positive': ['01', '86', '88', '03', '37', '96']},
        'valid': {'negative': ['70', '80'],
                  'positive': ['55', '05']},
        'test': {'negative': ['78', '73', '75', '26'],
                 'positive': ['98', '53', '07', '101']},
    },
}

for f in sorted(fold_bag_arrangement):
    fold_dir = os.path.join('./', 'fold'+str(f))
    dst = os.path.join(fold_dir, dataset_folder_name)
    # NOTE: copytree raises if dst already exists, e.g. when the script is re-run.
    shutil.copytree(src, dst)
    path_data = dst
    basePath = Path(path_data)

    # Reverse lookup for this fold: bag name -> (set, class)
    bag_destination = {
        bag: (se, cl)
        for se in set_name
        for cl in class_name
        for bag in fold_bag_arrangement[f][se][cl]
    }

    # move all bags folders to common folder
    # (directory listings are snapshotted with list() because entries are
    # moved while looping)
    for child in list(basePath.iterdir()):
        if child.is_dir() and child.name in set_name:
            for grandchild in list(child.iterdir()):
                if grandchild.is_dir():
                    for ggrandchild in list(grandchild.iterdir()):
                        shutil.move(str(ggrandchild), os.path.join(basePath, ggrandchild.name))

    # move bags folders to corresponding subfolders
    for child in list(basePath.iterdir()):
        if child.name in bag_destination:
            se, cl = bag_destination[child.name]
            shutil.move(str(child), os.path.join(basePath, se, cl, child.name))