Classes that simplify imports from fedbiomed.common._dataset_controller
Classes
Controller
Controller()
Bases: ABC, DataLoadingPlanMixin
Source code in fedbiomed/common/dataloadingplan/_data_loading_plan.py
def __init__(self):
    """Start with no data loading plan attached to the object."""
    # `_dlp` holds the data loading plan; `None` means none has been set yet
    self._dlp = None
Attributes
root property writable
root
Functions
get_sample abstractmethod
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_controller.py
@abstractmethod
def get_sample(self, index: int) -> Dict[str, Any]:
    """Return the raw sample stored at `index`, with no transforms applied.

    Abstract: every concrete controller has to provide its own implementation.

    Args:
        index: Position of the sample to retrieve

    Returns:
        The sample as a dict
    """
get_types
get_types()
Get type directly from values in dict returned by get_sample
Source code in fedbiomed/common/dataset_controller/_controller.py
def get_types(self):
    """Map each key of the first sample to the type name of its value.

    Returns:
        Dict `{key: type name}` built from `get_sample(0)`
    """
    first_sample = self.get_sample(0)
    type_names = {}
    for name, value in first_sample.items():
        type_names[name] = type(value).__name__
    return type_names
shape
shape()
Get shape directly from values in dict returned by get_sample
Source code in fedbiomed/common/dataset_controller/_controller.py
def shape(self) -> Dict[str, Any]:
    """Describe the shape of every entry of the first sample.

    Handled value kinds, checked in this order: anything exposing `.shape`,
    `int`/`float` (reported as 1), `dict` (its length), `PIL` image
    (`size` and `mode`) and `None`. Child classes may override this method
    for specific cases.

    Returns:
        Dict mapping each key of `get_sample(0)` to its shape description

    Raises:
        FedbiomedError: if the sample is not a `dict`, or if one of its values
            is of a kind that is not handled
    """
    sample = self.get_sample(0)

    if not isinstance(sample, dict):
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Expected `sample` to be a `dict`, got "
            f"{type(sample).__name__}"
        )

    shapes = {}
    for key, val in sample.items():
        # Guard clauses: the first matching kind wins, same order as documented
        if hasattr(val, "shape"):
            shapes[key] = val.shape
            continue
        if isinstance(val, (int, float)):
            shapes[key] = 1
            continue
        if isinstance(val, dict):
            shapes[key] = len(val)
            continue
        if isinstance(val, Image):
            shapes[key] = {"size": val.size, "mode": val.mode}
            continue
        if val is None:
            shapes[key] = None
            continue
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Not possible to get shape for value "
            f"of key: {key} that is type {type(val).__name__}"
        )
    return shapes
validate
validate()
Validates coherence of controller
Raises:
| Type | Description |
|---|---|
FedbiomedError | if coherence issue is found |
Source code in fedbiomed/common/dataset_controller/_controller.py
def validate(self) -> None:
    """Check the coherence of the controller.

    This implementation performs no check and simply returns.

    Raises:
        FedbiomedError: if coherence issue is found
    """
    return
CustomController
CustomController(root, **kwargs)
Bases: Controller
Custom dataset controller for MNIST dataset
Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def __init__(self, root, **kwargs):
    """Store `root` and record the arguments used to build the controller.

    Args:
        root: Root path of the dataset
        **kwargs: Extra keyword arguments, kept as-is in the controller kwargs
    """
    self.root = root

    # "root" comes first, followed by the extra keyword arguments
    recorded = {"root": str(self.root)}
    recorded.update(kwargs)
    self._controller_kwargs = recorded
Attributes
root instance-attribute
root = root
Functions
get_sample
get_sample(index)
Shouldn't be called as the custom controller does not need to get a sample.
It is handled by get_item in CustomDataset.
Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def get_sample(self, index):
    """Placeholder: the custom controller never retrieves samples itself.

    Sample retrieval is handled by `get_item` in `CustomDataset`, so this
    method is not expected to be called.

    Returns:
        Always `None`
    """
    return
get_types
get_types()
Controller does not know how to get samples.
Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def get_types(self):
    """Return nothing: the controller does not know how to get samples."""
    return None
shape
shape()
Len and shape should never be used, as the dataset implements it specifically.
It is impossible for the custom controller to know the shape of the dataset.
Source code in fedbiomed/common/dataset_controller/_custom_controller.py
def shape(self):
    """Placeholder: the custom controller cannot know the dataset shape.

    Len and shape are implemented by the dataset itself, so this method
    should never be used.

    Returns:
        Always `None`
    """
    return
ImageFolderController
ImageFolderController(root)
Bases: Controller
Generic ImageFolder where data is arranged like this:

    root
    ├── class_x
    │   ├── xxx.ext
    │   ├── xxy.ext
    │   └── ...
    ├── class_y
    │   ├── 123.ext
    │   └── ...
    └── ...
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
root | Union[str, Path] | Root directory path | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if `ImageFolder` can not be initialized |
Source code in fedbiomed/common/dataset_controller/_image_folder_controller.py
def __init__(self, root: Union[str, Path]) -> None:
    """Constructor of the class

    Args:
        root: Root directory path

    Raises:
        FedbiomedError: if `ImageFolder` can not be initialized
    """
    self.root = root

    try:
        # Discover the class folders, then list the samples found under them
        _, self._class_to_idx = folder.find_classes(directory=self.root)
        self._samples = folder.make_dataset(
            directory=self.root,
            class_to_idx=self._class_to_idx,
            extensions=self._extensions,
            is_valid_file=self._is_valid_file,
            allow_empty=self._allow_empty,
        )
    except Exception as e:
        # Any failure while scanning the folder is reported as a FedbiomedError
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: "
            "Failed to instantiate ImageFolderDataset object"
        ) from e

    # Arguments recorded for this controller
    self._controller_kwargs = {
        "root": str(self.root),
    }
Attributes
root instance-attribute
root = root
Functions
get_sample
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_image_folder_controller.py
def get_sample(self, index: int) -> Dict[str, Any]:
    """Load the item at `index` and return it without applying transforms.

    Args:
        index: Position of the sample in the list of samples

    Returns:
        Dict with the loaded item under "data" and its label under "target"

    Raises:
        FedbiomedError: if the sample can not be looked up or loaded
    """
    try:
        entry = self._samples[index]
        path, target = entry
        data = self._loader(path)
    except Exception as e:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from e
    else:
        return {"data": data, "target": target}
MedNistController
MedNistController(root)
Bases: Controller
Generic data controller where the data is arranged in this way:

    root
    └── MedNIST
        ├── AbdomenCT
        │   ├── 000000.jpeg
        │   └── ...
        ├── BreastMRI/
        ├── ChestCT/
        ├── CXR/
        ├── Hand/
        └── HeadCT/
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
root | Union[str, Path] | Root directory path | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if `ImageFolder` can not be initialized |
Source code in fedbiomed/common/dataset_controller/_mednist_controller.py
def __init__(self, root: Union[str, Path]) -> None:
    """Constructor of the class

    Args:
        root: Root directory path

    Raises:
        FedbiomedError: if `ImageFolder` can not be initialized
    """
    self.root = root

    # Fetch the data only when the `MedNIST` folder is not already under root
    if not (self.root / "MedNIST").exists():
        download_mednist(self.root)

    try:
        self._dataset = ImageFolder(self.root / "MedNIST")
    except Exception as e:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: "
            f"The following error raised while loading the data folder: {e}"
        ) from e

    # Arguments recorded for this controller
    self._controller_kwargs = {
        "root": str(self.root),
    }
Attributes
root instance-attribute
root = root
Functions
get_sample
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_mednist_controller.py
def get_sample(self, index: int) -> Dict[str, Any]:
    """Return the item at `index` of the wrapped dataset, without transforms.

    Args:
        index: Position of the sample in the dataset

    Returns:
        Dict with the item under "data" and its label under "target"

    Raises:
        FedbiomedError: if the item can not be retrieved
    """
    try:
        item = self._dataset[index]
        data, target = item
    except Exception as e:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from e
    else:
        return {"data": data, "target": target}
MedicalFolderController
MedicalFolderController(root, tabular_file=None, index_col=None, dlp=None, validate=True)
Bases: Controller
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
root | Union[str, Path] | Root directory path | required |
tabular_file | Optional[Union[str, PathLike, Path]] | Path to CSV file containing the demographic information | None |
index_col | Optional[Union[int, str]] | Column name in tabular file containing the subjects names | None |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if one in `tabular_file` and `index_col` is given and the other is not |
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def __init__(
self,
root: Union[str, Path],
tabular_file: Optional[Union[str, PathLike, Path]] = None,
index_col: Optional[Union[int, str]] = None,
dlp: Optional[DataLoadingPlan] = None,
validate: bool = True,
):
"""Constructor for class `MedicalFolder`
Args:
root: Root directory path
tabular_file: Path to CSV file containing the demographic information
index_col: Column name in tabular file containing the subjects names
dlp: Optional data loading plan; when given it is applied with `set_dlp`,
which also validates the controller
validate: If True, `validate` is called at the end of the constructor
Raises:
FedbiomedError:
- if one in `tabular_file` and `index_col` is given and the other is not
"""
# Mixin initialisation: starts with no data loading plan
DataLoadingPlanMixin.__init__(self)
self.root = root
self._tabular_file = self._validate_tabular_file(tabular_file)
self._index_col = self._validate_index_col(index_col)
# Folder structure <subject>/<modality>/<file> in DataFrame format
self._df_dir = self._make_df_dir(root=self.root, extensions=self._extensions)
# Demographics: `tabular_file` and `index_col` must be given together
if (tabular_file is None) != (index_col is None):
raise FedbiomedError(
f"{ErrorNumbers.FB613.value}: "
"Arguments `tabular_file` and `index_col`, both or none are expected"
)
# No demographics are loaded when no tabular file is given
self._demographics = (
None
if tabular_file is None
else self.read_demographics(tabular_file, index_col)
)
# Data loading plan (`set_dlp` also triggers `validate`)
if dlp is not None:
self.set_dlp(dlp)
# Function 'validate' instantiates self._controller_kwargs
# NOTE(review): with `dlp` given and `validate=True`, validation runs twice
if validate is True:
self.validate()
Attributes
demographics property
demographics
df_dir property
df_dir
Returns a copy to ensure _df_dir is not modified
index_col property
index_col
modalities property
modalities
Returns keys of dict that maps modalities to folders
root instance-attribute
root = root
subjects property
subjects
tabular_file property
tabular_file
Functions
available_subjects
available_subjects(subjects_from_index, subjects_from_folder=None)
Checks missing subject folders and missing entries in demographics
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
subjects_from_index | Union[list, Series] | Given subject folder names in demographics | required |
subjects_from_folder | list | list of subject folder names | None |
Returns:
| Type | Description |
|---|---|
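Dict[str, str] | Dict with next keys: `missing_folders`: subjects in demographics absent in folder structure; `missing_entries`: subjects in folder structure absent in demographics; `intersection`: subjects present in folder structure and demographics |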
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def available_subjects(
    self,
    subjects_from_index: Union[list, pd.Series],
    subjects_from_folder: list = None,
) -> Dict[str, str]:
    """Compare the subjects listed in demographics with the subject folders.

    Args:
        subjects_from_index: Given subject folder names in demographics
        subjects_from_folder: list of subject folder names. When omitted,
            `self.subjects` is used.

    Returns:
        Dict with next keys:
            - missing_folders: subjects in demographics absent in folder structure
            - missing_entries: subjects in folder structure absent in demographics
            - intersection: subjects present in folder structure and demographics
    """
    # Fall back to every known subject folder when none is given
    if subjects_from_folder is None:
        subjects_from_folder = self.subjects

    # Missing subject folders: these will cause warnings
    missing_folders = set(subjects_from_index).difference(subjects_from_folder)
    # Missing demographics entries: these will cause errors
    missing_entries = set(subjects_from_folder).difference(subjects_from_index)
    # Subjects found on both sides
    common = set(subjects_from_index).intersection(subjects_from_folder)

    return {
        "missing_folders": list(missing_folders),
        "missing_entries": list(missing_entries),
        "intersection": list(common),
    }
demographics_column_names
demographics_column_names(path)
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def demographics_column_names(self, path: Union[str, Path]):
    """Return the column names of the demographics file at `path`.

    Args:
        path: Path to the demographics tabular file

    Returns:
        Values of the `columns` attribute of the loaded demographics
    """
    demographics = self.read_demographics(path)
    return demographics.columns.values
get_dataset_type staticmethod
get_dataset_type()
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
@staticmethod
def get_dataset_type() -> DatasetTypes:
    """Return the dataset type of this controller: `MEDICAL_FOLDER`."""
    dataset_type = DatasetTypes.MEDICAL_FOLDER
    return dataset_type
get_sample
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def get_sample(
    self, index: int
) -> Dict[str, NiftiReader.data_type | Dict[str, Any]]:
    """Read every modality of the sample at `index`, without transforms.

    The controller is validated first if that has not been done yet.

    Args:
        index: Position of the sample in the list of samples

    Returns:
        Dict with one entry per modality, plus a "demographics" entry when
        the sample carries one

    Raises:
        FedbiomedError: if one of the modalities can not be read
    """
    if self._validated is False:
        self.validate()

    sample = self._samples[index]

    data = {}
    try:
        for modality in self.modalities:
            data[modality] = NiftiReader.read(sample[modality])
    except Exception as e:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from e

    # Demographics are forwarded untouched when present
    if "demographics" in sample:
        data["demographics"] = sample["demographics"]
    return data
read_demographics staticmethod
read_demographics(tabular_file, index_col=None)
Read demographics tabular file
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
tabular_file | Union[str, Path] | path to demographics file | required |
index_col | Optional[Union[int, str]] | Index column that matches `<subject>`. Defaults to None. | None |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if the file can not be loaded |
Returns:
| Type | Description |
|---|---|
DataFrame | Demographics in DataFrame format |
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
@staticmethod
def read_demographics(
    tabular_file: Union[str, Path],
    index_col: Optional[Union[int, str]] = None,
) -> pd.DataFrame:
    """Read demographics tabular file

    Args:
        tabular_file: path to demographics file
        index_col: Index column that matches <subject>. Either a column name
            or an integer column position. Defaults to None.

    Raises:
        FedbiomedError: if the file can not be loaded

    Returns:
        Demographics in DataFrame format, with duplicated index entries
        removed (the first occurrence is kept)
    """
    tabular_file = MedicalFolderController._validate_tabular_file(tabular_file)

    try:
        demographics = CsvReader(tabular_file).data.to_pandas()
        if index_col is not None:
            if isinstance(index_col, int):
                # An integer is a column position: check it, then turn it
                # into the matching column name
                if index_col < 0 or index_col >= len(demographics.columns):
                    raise FedbiomedError(
                        f"{ErrorNumbers.FB613.value}: "
                        f"Index column {index_col} is out of bounds"
                    )
                index_col = demographics.columns[index_col]
            demographics = demographics.set_index(index_col)
    except FedbiomedError as e:
        # Separator fixed: the prefix used to render as "...: :Can not load"
        raise FedbiomedError(
            f"{ErrorNumbers.FB613.value}: "
            f"Can not load demographics tabular file. Error message is: {e}"
        ) from e

    length = len(demographics)
    logger.info(f"Number of rows in demographics file: {length}")

    # Keep the first one in duplicated subjects
    demographics = demographics.loc[~demographics.index.duplicated(keep="first")]
    if length != len(demographics):
        logger.info(f"Length of demographics for unique index {len(demographics)}")

    return demographics
set_dlp
set_dlp(dlp)
Ensures validation of controller object after set_dlp is executed
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def set_dlp(self, dlp):
    """Attach `dlp` through the mixin, then validate the controller again.

    Args:
        dlp: Data loading plan to set
    """
    DataLoadingPlanMixin.set_dlp(self, dlp)
    # Validation has to run again once the data loading plan has changed
    self.validate()
subject_modality_status
subject_modality_status(index=None)
Scans subjects and checks which modalities exist for each subject
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
index | Union[list, Series] | Array-like index that comes from reference csv file. It represents subject folder names. Defaults to None. | None |
Returns: Modality status that indicates which modalities are available per subject
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def subject_modality_status(self, index: Union[list, pd.Series] = None) -> Dict:
"""Scans subjects and checks which modalities exist for each subject
Args:
index: Array-like index that comes from reference csv file.
It represents subject folder names. Defaults to None.
Returns:
Modality status that indicates which modalities are available per subject,
as a dict with keys "columns", "data" and "index" taken from the
resulting DataFrame
"""
# Pivot into wide format with boolean indicators:
# rows are subjects, columns are modalities
df_ = (
self.df_dir.assign(val=True)
.pivot_table(
index="subject",
columns="modality",
values="val",
fill_value=0,
)
.astype(bool)
)
if index is not None:
# Every subject coming from the pivot is flagged as found in the folder
df_["in_folder"] = True
# Merge with pivot (outer join), so subjects only present in `index`
# are kept as well, flagged by the `in_index` column
df_ = pd.merge(
df_,
pd.DataFrame(True, index=index, columns=["in_index"]),
left_index=True,
right_index=True,
how="outer",
)
# Fill missing values with False
df_ = df_.fillna(False)
# Plain lists are returned instead of pandas objects
return {
"columns": df_.columns.tolist(),
"data": df_.values.tolist(),
"index": df_.index.tolist(),
}
validate
validate()
Source code in fedbiomed/common/dataset_controller/_medical_folder_controller.py
def validate(self) -> None:
"""Prepare modalities, subjects and samples, then check a sample can be read

Also sets `self._controller_kwargs`.
"""
# Filter subjects to contain all modalities
self._modalities, df_dir = self._prepare_df_dir_for_use(self.df_dir, self._dlp)
# Generate list of samples: dict, with demographics and path to modalities
self._subjects, self._samples = self._make_dataset(self.demographics, df_dir)
# Change flag to be able to recover a sample
# (`get_sample` calls `validate` while the flag is False)
self._validated = True
# Check if is possible to use `reader` to recover a valid item
_ = self.get_sample(0)
# Arguments recorded for this controller
self._controller_kwargs = {
"root": str(self.root),
"tabular_file": (
None if self.tabular_file is None else str(self.tabular_file)
),
"index_col": self.index_col,
"dlp": self._dlp,
}
MedicalFolderLoadingBlockTypes
MedicalFolderLoadingBlockTypes(*args)
Bases: DataLoadingBlockTypes, Enum
Source code in fedbiomed/common/constants.py
def __init__(self, *args):
# Rejects members whose value is not a `str` or duplicates an existing value
cls = self.__class__
# Every member value must be a string
if not isinstance(self.value, str):
raise ValueError(
"all fields of DataLoadingBlockTypes subclasses must be of str type"
)
# Compare the new value against the members found when iterating `cls`
# NOTE(review): presumably the member being created is not yet part of
# `cls` at this point - confirm against the enum implementation
if any(self.value == e.value for e in cls):
a = self.name
# Name of the member that already holds this value
e = cls(self.value).name
raise ValueError(
f"duplicate values not allowed in DataLoadingBlockTypes and "
f"its subclasses: {a} --> {e}"
)
Attributes
MODALITIES_TO_FOLDERS class-attribute instance-attribute
MODALITIES_TO_FOLDERS = 'modalities_to_folders'
MnistController
MnistController(root, train=True, download=True)
Bases: Controller
Generic Mnist controller where the data is arranged in this way:

    root
    └── MNIST
        └── raw
            ├── train-images-idx3-ubyte
            ├── train-labels-idx1-ubyte
            ├── t10k-images-idx3-ubyte
            └── t10k-labels-idx1-ubyte
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
root | Union[str, Path] | Root directory path | required |
train | bool | If true then train files are used | True |
download | bool | If true then downloads and extracts the files if they do not exist | True |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if `torchvision.datasets.MNIST` can not be initialized |
Source code in fedbiomed/common/dataset_controller/_mnist_controller.py
def __init__(
    self,
    root: Union[str, Path],
    train: bool = True,
    download: bool = True,
) -> None:
    """Constructor of the class

    Args:
        root: Root directory path
        train: If true then train files are used
        download: If true then downloads and extracts the files if they do not exist

    Raises:
        FedbiomedError: if `torchvision.datasets.MNIST` can not be initialized
    """
    self.root = root

    try:
        self._dataset = MNIST(root=self.root, train=train, download=download)
    except Exception as e:
        # The second literal needs the `f` prefix, otherwise the text "{e}"
        # is emitted instead of the underlying error
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: "
            f"Failed to instantiate MnistDataset object. {e}"
        ) from e

    # Arguments recorded for this controller; "download" is always False here
    self._controller_kwargs = {
        "root": str(self.root),
        "train": train,
        "download": False,
    }
Attributes
root instance-attribute
root = root
Functions
get_sample
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_mnist_controller.py
def get_sample(self, index: int) -> Dict[str, Any]:
    """Return the item at `index` of the wrapped dataset, without transforms.

    Args:
        index: Position of the sample in the dataset

    Returns:
        Dict with the item under "data" and its label under "target"

    Raises:
        FedbiomedError: if the item can not be retrieved
    """
    try:
        item = self._dataset[index]
        data, target = item
    except Exception as e:
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
        ) from e
    else:
        return {"data": data, "target": target}
TabularController
TabularController(root)
Bases: Controller
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
root | Union[str, Path] | Root directory path | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if `root` does not exist |
Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def __init__(
    self,
    root: Union[str, Path],
) -> None:
    """Build the controller and its CSV reader.

    Args:
        root: Root directory path

    Raises:
        FedbiomedError: if `root` does not exist
    """
    self.root = root
    # The reader is built from the stored root
    self._reader = CsvReader(self.root)
    # Arguments recorded for this controller
    self._controller_kwargs = dict(root=str(self.root))
Attributes
root instance-attribute
root = root
Functions
get_sample
get_sample(index)
Retrieve a data sample without applying transforms
Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def get_sample(self, index: int) -> pl.DataFrame:
    """Return the sample at `index` from the reader, without transforms.

    Args:
        index: Position of the sample

    Returns:
        What the reader returns for `index`

    Raises:
        FedbiomedError: if `index` is greater than or equal to the length
            of the controller
    """
    size = self.__len__()
    if not index >= size:
        return self._reader.get(index)
    raise FedbiomedError(
        f"{ErrorNumbers.FB632.value}: Failed to retrieve item at index {index}"
    )
get_types
get_types()
Get dtypes of the columns in the Tabular dataset
Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def get_types(self):
    """Map every column of the tabular dataset to the name of its dtype class.

    Returns:
        Dict `{column: dtype class name}` built from the reader's data schema
    """
    schema = self._reader.data.schema
    dtype_names = {}
    for col, dtype in schema.items():
        dtype_names[col] = dtype.__class__.__name__
    return dtype_names
normalize_columns
normalize_columns(columns)
Validate and normalize columns to a list of column names
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
columns | Union[Iterable, int, str] | Columns to normalize | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | if `columns` is not valid |
Returns:
| Type | Description |
|---|---|
list[str] | List of column names |
Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def normalize_columns(self, columns: Union[Iterable, int, str]) -> list[str]:
    """Turn `columns` into a list of column names, delegating to the reader.

    Args:
        columns: Columns to normalize

    Raises:
        FedbiomedError: if `columns` is not valid

    Returns:
        List of column names
    """
    normalized = self._reader.normalize_columns(columns=columns)
    return normalized
shape
shape()
Source code in fedbiomed/common/dataset_controller/_tabular_controller.py
def shape(self) -> Dict:
    """Return the shape reported by the underlying reader."""
    reader_shape = self._reader.shape()
    return reader_shape