In [ ]:
import hashlib
import os
import requests
from tqdm import tqdm
import argparse
from zipfile import ZipFile
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
In [ ]:
from fedbiomed.node.config import node_component
In [ ]:
def parse_args():
    parser = argparse.ArgumentParser(description="IXI Sample downloader and splitter")
    parser.add_argument("-f", "--root_folder", required=True, type=str)
    # BooleanOptionalAction provides both --force and --no-force;
    # no `type` is needed (and passing one is deprecated since Python 3.12)
    parser.add_argument(
        "-F",
        "--force",
        action=argparse.BooleanOptionalAction,
        required=False,
        default=False,
    )
    return parser.parse_args()
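For orientation, here is a minimal, illustrative sketch of how the `--force/--no-force` flag declared above behaves; it is not part of the downloader script itself.

# Minimal sketch (illustrative only): behaviour of argparse.BooleanOptionalAction
# as used for the --force flag above.
import argparse

demo_parser = argparse.ArgumentParser()
demo_parser.add_argument("--force", action=argparse.BooleanOptionalAction, default=False)

print(demo_parser.parse_args([]).force)              # False (default)
print(demo_parser.parse_args(["--force"]).force)     # True
print(demo_parser.parse_args(["--no-force"]).force)  # False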
In [ ]:
def has_correct_checksum_md5(filename, expected_md5):
    """Return True if the MD5 digest of `filename` matches `expected_md5`."""
    with open(filename, "rb") as f:
        file_hash = hashlib.md5()
        # Read the file in 8 KiB chunks to avoid loading it fully into memory
        while chunk := f.read(8192):
            file_hash.update(chunk)
    return file_hash.hexdigest() == expected_md5
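A small usage sketch of the checksum helper. The archive path below is an assumption about where the file was saved; the expected MD5 is the one asserted further down in this script.

# Usage sketch: verify a previously downloaded archive (path is an assumption).
zip_path = os.path.join("notebooks", "data", "7kd5wj7v7p-3.zip")
if os.path.exists(zip_path):
    ok = has_correct_checksum_md5(zip_path, "eecb83422a2685937a955251fa45cb03")
    print("Checksum OK" if ok else "Archive appears corrupted, re-download it")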
In [ ]:
def download_file(url, filename):
    """
    Helper method handling downloading large files from `url` to `filename`. Returns a pointer to `filename`.
    """
    print("Downloading file from:", url)
    print("File will be saved as:", filename)
    chunk_size = 1024
    r = requests.get(url, stream=True)
    with open(filename, "wb") as f:
        with tqdm(unit="B", total=int(r.headers["Content-Length"])) as pbar:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)
    return filename
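If you want to call the download helper on its own, a minimal usage sketch could look like the following. It is disabled by default because the archive is large, and the output filename is arbitrary.

# Usage sketch (not executed by default): fetch the IXI sample archive used below.
RUN_DOWNLOAD_EXAMPLE = False  # flip to True to actually download the archive
if RUN_DOWNLOAD_EXAMPLE:
    sample_url = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/7kd5wj7v7p-3.zip"
    download_file(sample_url, "ixi_sample.zip")  # output name is arbitrary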
In [ ]:
def download_and_extract_ixi_sample(root_folder):
    url = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/7kd5wj7v7p-3.zip"
    data_folder = os.path.join(root_folder, "notebooks", "data")
    zip_filename = os.path.join(data_folder, "7kd5wj7v7p-3.zip")
    extracted_folder = os.path.join(data_folder, "7kd5wj7v7p-3", "IXI_sample")

    # Download the archive if it is not already present
    if not os.path.exists(zip_filename):
        download_file(url, zip_filename)

    # If the dataset has already been extracted, there is nothing more to do
    if os.path.isdir(extracted_folder):
        print(f"Dataset folder already exists in {extracted_folder}")
        return extracted_folder

    # Verify the archive integrity, then extract it
    assert has_correct_checksum_md5(zip_filename, "eecb83422a2685937a955251fa45cb03")
    with ZipFile(zip_filename, "r") as zip_obj:
        zip_obj.extractall(data_folder)
    assert os.path.isdir(extracted_folder)
    return extracted_folder
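Before splitting, it can be handy to peek at the `participants.csv` table shipped with the extracted sample. The optional sketch below assumes the root folder is `~/fedbiomed` (adjust to your setup); the `SITE_NAME` and `FOLDER_NAME` columns are the ones the splitting code below relies on.

# Optional sanity-check sketch: inspect the participants table of the extracted sample.
INSPECT_SAMPLE = False  # flip to True once the archive has been extracted
if INSPECT_SAMPLE:
    sample_folder = download_and_extract_ixi_sample(os.path.expanduser("~/fedbiomed"))  # assumed root folder
    participants = pd.read_csv(os.path.join(sample_folder, "participants.csv"))
    print(participants[["FOLDER_NAME", "SITE_NAME"]].head())
    print(participants["SITE_NAME"].value_counts())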
In [ ]:
if __name__ == "__main__":
    args = parse_args()
    root_folder = os.path.abspath(os.path.expanduser(args.root_folder))
    assert os.path.isdir(root_folder), f"Folder does not exist: {root_folder}"

    # Centralized dataset
    centralized_data_folder = download_and_extract_ixi_sample(root_folder)

    # Federated dataset
    federated_data_folder = os.path.join(
        root_folder, "notebooks", "data", "Hospital-Centers"
    )
    shutil.rmtree(federated_data_folder, ignore_errors=True)

    csv_global = os.path.join(centralized_data_folder, "participants.csv")
    allcenters = pd.read_csv(csv_global)

    # Split subjects by acquisition center
    center_names = ["Guys", "HH", "IOP"]
    center_dfs = list()
    for center_name in center_names:
        # Create one node component per center (folder name matches the --path
        # used in the commands printed at the end)
        cfg_folder = os.path.join(root_folder, center_name.lower())
        os.makedirs(cfg_folder, exist_ok=True)
        print(f"Creating node component at: {cfg_folder}")
        if node_component.is_component_existing(cfg_folder):
            print(f"**Warning: component {cfg_folder} already exists")
        else:
            node_component.initiate(cfg_folder)

        # Select the subjects acquired at this center and split them 90/10
        df = allcenters[allcenters.SITE_NAME == center_name]
        center_dfs.append(df)
        train, test = train_test_split(df, test_size=0.1, random_state=21)

        train_folder = os.path.join(federated_data_folder, center_name, "train")
        holdout_folder = os.path.join(federated_data_folder, center_name, "holdout")
        os.makedirs(train_folder, exist_ok=True)
        os.makedirs(holdout_folder, exist_ok=True)

        # Copy training subjects and their participants table
        for subject_folder in train.FOLDER_NAME.values:
            shutil.copytree(
                src=os.path.join(centralized_data_folder, subject_folder),
                dst=os.path.join(train_folder, subject_folder),
                dirs_exist_ok=True,
            )
        train_participants_csv = os.path.join(train_folder, "participants.csv")
        train.to_csv(train_participants_csv)

        # Copy held-out subjects and their participants table
        for subject_folder in test.FOLDER_NAME.values:
            shutil.copytree(
                src=os.path.join(centralized_data_folder, subject_folder),
                dst=os.path.join(holdout_folder, subject_folder),
                dirs_exist_ok=True,
            )
        test.to_csv(os.path.join(holdout_folder, "participants.csv"))

    print(f"Centralized dataset located at: {centralized_data_folder}")
    print(f"Federated dataset located at: {federated_data_folder}")
    print()
    print("Please add the data to each node, using the `ixi-train` tag, by executing:")
    for center_name in center_names:
        print(f"\tfedbiomed node --path ./{center_name.lower()} dataset add")
    print()
    print("Then start your nodes by executing:")
    for center_name in center_names:
        print(f"\tfedbiomed node --path ./{center_name.lower()} start")
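The per-center split above keeps 90% of each center's subjects for training and holds out 10%, with a fixed `random_state` so the split is reproducible across runs. A toy sketch of that logic, on illustrative data only:

# Toy sketch of the per-center split used above (illustrative DataFrame, not IXI data).
toy = pd.DataFrame({
    "FOLDER_NAME": [f"IXI{i:03d}" for i in range(20)],
    "SITE_NAME": ["Guys"] * 20,
})
toy_train, toy_holdout = train_test_split(toy, test_size=0.1, random_state=21)
print(len(toy_train), "training subjects,", len(toy_holdout), "held-out subjects")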