Classes that simplify imports from fedbiomed.node.dataset_manager
Classes
DatasetManager
DatasetManager(path, min_samples=0)
Interfaces with the node component database.
Facility for storing data, retrieving data and getting data info for the node. Currently uses TinyDB.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path | str | Path to the database file. | required |
min_samples | int | Minimum number of samples required when adding a dataset. Defaults to 0 (no minimum enforced). | 0 |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def __init__(self, path: str, min_samples: int = 0):
    """Create a dataset manager backed by the database file at ``path``.

    Args:
        path: Path to the database file.
        min_samples: Minimum number of samples required when adding a dataset.
            Defaults to 0 (no minimum enforced).
    """
    # All table handles share the same backing database file.
    self._dataset_table = DatasetTable(path)
    self._dynamic_dataset_table = DynamicDatasetTable(path)
    self._dlp_table = DlpTable(path)
    self._dlb_table = DlbTable(path)
    self._min_samples = min_samples
Attributes
dataset_table property
dataset_table
dlb_table property
dlb_table
dlp_table property
dlp_table
dynamic_dataset_table property
dynamic_dataset_table
Functions
add_database
add_database(name, data_type, tags, description, path, dataset_id=None, dataset_parameters=None, data_loading_plan=None, save_dlp=True)
Register a dataset in the database.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
name | str | Name of the dataset | required |
data_type | str | Type of the dataset (e.g. 'tabular', 'image-folder', 'medical-folder') | required |
tags | List[str] | Tags associated with the dataset | required |
description | str | Description of the dataset | required |
path | str | Path to the dataset | required |
dataset_id | Optional[str] | Optional ID for the dataset. If None, a new ID will be generated. | None |
dataset_parameters | Optional[dict] | Optional parameters for the dataset controller | None |
data_loading_plan | Optional[DataLoadingPlan] | Optional DataLoadingPlan associated with the dataset | None |
save_dlp | bool | Whether to save the DataLoadingPlan to the database if provided | True |
Returns:
| Type | Description |
|---|---|
str | The dataset_id of the registered dataset
Raises:
| Type | Description |
|---|---|
FedbiomedError | If there are conflicting tags with existing datasets, if the data loading plan name is invalid or not unique, or if the data_type is not supported.
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def add_database(
    self,
    name: str,
    data_type: str,
    tags: List[str],
    description: str,
    path: str,
    dataset_id: Optional[str] = None,
    dataset_parameters: Optional[dict] = None,
    data_loading_plan: Optional[DataLoadingPlan] = None,
    save_dlp: bool = True,
):
    """Register a dataset in the database.

    Args:
        name: Name of the dataset
        data_type: Type of the dataset (e.g. 'tabular', 'image-folder', 'medical-folder')
        tags: Tags associated with the dataset
        description: Description of the dataset
        path: Path to the dataset
        dataset_id: Optional ID for the dataset. If None, a new ID will be generated.
        dataset_parameters: Optional parameters for the dataset controller
        data_loading_plan: Optional DataLoadingPlan associated with the dataset
        save_dlp: Whether to save the DataLoadingPlan to the database if provided

    Returns:
        The dataset_id of the registered dataset

    Raises:
        FedbiomedError:
            - If there are conflicting tags with existing datasets
            - If the data loading plan name is invalid or not unique
            - If the data_type is not supported
    """
    # Validation (tags, DLP name, data_type) happens inside the entry builder.
    new_entry = self._build_dataset_entry(
        name=name,
        tags=tags,
        description=description,
        data_type=data_type,
        path=path,
        dataset_id=dataset_id,
        dataset_parameters=dataset_parameters,
        data_loading_plan=data_loading_plan,
    )
    inserted_id = self.dataset_table.insert(new_entry)
    # save_data_loading_plan is a no-op when data_loading_plan is None.
    if save_dlp:
        self.save_data_loading_plan(data_loading_plan)
    return inserted_id
add_dynamic_dataset
add_dynamic_dataset(path, researcher_id, experiment_id, processing_id, parent_dataset_id, name=None, tags=None, description=None, dataset_id=None, dataset_parameters=None)
Adds a dynamic dataset to the database.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path | str | the path where the dynamic dataset files are stored. | required |
researcher_id | str | the id of the researcher who created the dynamic dataset | required |
experiment_id | str | the id of the experiment for which the dynamic dataset was created | required |
processing_id | str | the id of the processing that generated the dynamic dataset | required |
parent_dataset_id | str | the id of the parent dataset from which the dynamic dataset was derived | required |
name | Optional[str] | optional name for the dynamic dataset | None |
tags | Optional[List[str]] | optional list of tags for the dynamic dataset | None |
description | Optional[str] | optional description for the dynamic dataset | None |
dataset_id | Optional[str] | optional id for the dynamic dataset. If None, a new id will be generated. | None |
dataset_parameters | Optional[dict] | optional parameters for the dataset controller, such as e.g. data type specific parameters (e.g. for medical-folder, the tabular file name within the folder) | None |
Returns:
| Type | Description |
|---|---|
str | The dataset_id of the registered dynamic dataset
Raises:
| Type | Description |
|---|---|
FedbiomedError | If the parent dataset is not found in the database.
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def add_dynamic_dataset(
    self,
    path: str,
    researcher_id: str,
    experiment_id: str,
    processing_id: str,
    parent_dataset_id: str,
    name: Optional[str] = None,
    tags: Optional[List[str]] = None,
    description: Optional[str] = None,
    dataset_id: Optional[str] = None,
    dataset_parameters: Optional[dict] = None,
):
    """Adds a dynamic dataset to the database.

    Args:
        path: the path where the dynamic dataset files are stored.
        researcher_id: the id of the researcher who created the dynamic dataset
        experiment_id: the id of the experiment for which the dynamic dataset was created
        processing_id: the id of the processing that generated the dynamic dataset
        parent_dataset_id: the id of the parent dataset from which the dynamic dataset was derived
        name: optional name for the dynamic dataset
        tags: optional list of tags for the dynamic dataset
        description: optional description for the dynamic dataset
        dataset_id: optional id for the dynamic dataset. If None, a new id will be generated.
        dataset_parameters: optional parameters for the dataset controller, such as e.g.
            data type specific parameters (e.g. for medical-folder, the tabular file
            name within the folder)

    Returns:
        The dataset_id of the registered dynamic dataset

    Raises:
        FedbiomedError:
            - If the parent dataset is not found in the database
    """
    # Raises FedbiomedError when the parent does not exist; the dynamic
    # dataset inherits its data type from the parent.
    parent_entry, _ = self.get_dataset_entry_by_id(parent_dataset_id)

    dynamic_entry = self._build_dataset_entry(
        data_type=parent_entry["data_type"],
        path=path,
        dataset_id=dataset_id,
        dataset_parameters=dataset_parameters,
        name=name,
        tags=tags,
        description=description,
        # DLP is not supported for dynamic datasets for now
        data_loading_plan=None,
        extra_fields={
            "researcher_id": researcher_id,
            "experiment_id": experiment_id,
            "processing_id": processing_id,
            "parent_dataset_id": parent_dataset_id,
        },
    )
    return self.dynamic_dataset_table.insert(dynamic_entry)
delete_dataset_by_id
delete_dataset_by_id(dataset_id, recursive=False, reassign_children=False)
Deletes a dataset from the database by its ID.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset_id | str | the ID of the dataset to delete | required |
recursive | bool | whether to recursively delete all descendant dynamic datasets (if any). Defaults to False. | False |
reassign_children | bool | whether to delete the dataset, and reassign children to the parent dataset if the dataset has children. Defaults to False. | False |
Raises:
| Type | Description |
|---|---|
FedbiomedError | If no dataset is found with the given ID, or if the dataset has children and the recursive / reassign_children flags do not allow handling them.
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def delete_dataset_by_id(
    self,
    dataset_id: str,
    recursive: bool = False,
    reassign_children: bool = False,
) -> None:
    """Deletes a dataset (raw or dynamic) from the database by its ID.

    Args:
        dataset_id: the ID of the dataset to delete
        recursive: whether to recursively delete all descendant dynamic
            datasets (if any). Defaults to False.
        reassign_children: whether to delete the dataset and reattach its
            children to this dataset's parent. Only meaningful for dynamic
            datasets. Defaults to False.

    Raises:
        FedbiomedError:
            - If no dataset is found with the given ID
            - If the dataset is a raw dataset with children and reassign_children is True
            - If the dataset is a raw dataset with children and recursive is not True
            - If a dynamic dataset has children and neither recursive nor
              reassign_children is True
    """
    entry, table_name = self.get_dataset_entry_by_id(dataset_id)
    is_dynamic = table_name == self.dynamic_dataset_table._table_name
    children = self.dynamic_dataset_table.get_all_by_value(
        "parent_dataset_id", dataset_id
    )

    if not is_dynamic:
        # Raw datasets have no parent, so their children cannot be reassigned.
        if children:
            if reassign_children:
                raise FedbiomedError(
                    f"{ErrorNumbers.FB632.value}: Cannot reassign children of a dataset with children. Use recursive=True to delete subtree."
                )
            if not recursive:
                raise FedbiomedError(
                    f"{ErrorNumbers.FB632.value}: Dataset has derived dynamic datasets. Use recursive=True to delete subtree."
                )
            self._delete_dynamic_subtree(dataset_id)
        self.dataset_table.delete_by_id(dataset_id)
        return

    # Dynamic dataset: children must be either deleted or reattached.
    if children and not (recursive or reassign_children):
        raise FedbiomedError(
            f"{ErrorNumbers.FB632.value}: Dataset has children. Use recursive=True to delete subtree "
            "or reassign_children=True to reassign children."
        )
    if recursive:
        self._delete_dynamic_subtree(dataset_id)
    if reassign_children:
        # NOTE(review): behavior when recursive and reassign_children are both
        # True depends on _delete_dynamic_subtree (not visible here) — confirm.
        new_parent = entry.get("parent_dataset_id")
        for child in children:
            self.dynamic_dataset_table.update_by_id(
                child["dataset_id"],
                {"parent_dataset_id": new_parent},
            )
    self.dynamic_dataset_table.delete_by_id(dataset_id)
get_dataset_entry_by_id
get_dataset_entry_by_id(dataset_id)
Validates that the dataset exists and returns its DB entry and dataset type.
The dataset can be either
- a dataset (DatasetTable)
- a dynamic dataset (DynamicDatasetTable)
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dataset_id | str | the id of the dataset to validate | required |
Returns:
| Type | Description |
|---|---|
Tuple[dict, str] | Tuple of (entry dict, table name string) identifying where the record lives. |
Raises:
| Type | Description |
|---|---|
FedbiomedError | If dataset or dynamic dataset does not exist. |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def get_dataset_entry_by_id(self, dataset_id: str) -> Tuple[dict, str]:
    """Validates that the dataset exists and returns its DB entry and dataset type.

    The dataset can be either:
        - a dataset (DatasetTable)
        - a dynamic dataset (DynamicDatasetTable)

    Args:
        dataset_id: the id of the dataset to validate

    Returns:
        Tuple of (entry dict, table name string) identifying where the record lives.

    Raises:
        FedbiomedError: If dataset or dynamic dataset does not exist.
    """
    # Raw datasets are checked first, then dynamic ones.
    for table in (self.dataset_table, self.dynamic_dataset_table):
        entry = table.get_by_id(dataset_id)
        if entry is not None:
            return entry, table._table_name
    raise FedbiomedError(
        f"{ErrorNumbers.FB632.value}: Dataset with id {dataset_id} not found."
    )
get_dlp_by_id
get_dlp_by_id(dlp_id)
Get a data loading plan and its associated data loading blocks by ID.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dlp_id | str | ID of the data loading plan to retrieve. | required |
Returns:
| Type | Description |
|---|---|
Optional[dict] | A tuple of (dlp_metadata, dlbs) where dlp_metadata is the DLP dict |
List[dict] | (or None if not found) and dlbs is a list of associated DLB dicts. |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def get_dlp_by_id(self, dlp_id: str) -> Tuple[Optional[dict], List[dict]]:
    """Get a data loading plan and its associated data loading blocks by ID.

    Args:
        dlp_id: ID of the data loading plan to retrieve.

    Returns:
        A tuple of (dlp_metadata, dlbs) where dlp_metadata is the DLP dict
        (or None if not found) and dlbs is a list of associated DLB dicts.
    """
    metadata = self.dlp_table.get_by_id(dlp_id)
    # Missing DLP is not an error: return an empty result instead of raising.
    if metadata is None:
        logger.debug(
            "Data loading plan with id %s not found in database.",
            dlp_id,
        )
        return None, []
    logger.debug(
        "Data loading plan with id %s found in database. Retrieving associated data loading blocks.",
        dlp_id,
    )
    # loading_blocks maps block names to DLB ids; fetch all matching DLBs.
    block_ids = list(metadata["loading_blocks"].values())
    logger.debug(
        "Found %d data loading blocks associated with data loading plan id %s: dlb_ids=%s",
        len(block_ids),
        dlp_id,
        block_ids,
    )
    return metadata, self.dlb_table.get_all_by_value("dlb_id", block_ids)
list_my_datasets
list_my_datasets(verbose=True)
Lists all datasets on the node.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
verbose | bool | Give verbose output. Defaults to True. | True |
Returns:
| Type | Description |
|---|---|
List[dict] | All datasets in the node's database. |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def list_my_datasets(self, verbose: bool = True) -> List[dict]:
    """Lists all datasets on the node.

    Args:
        verbose: Give verbose output. Defaults to True.

    Returns:
        All datasets in the node's database.
    """
    datasets = self.dataset_table.all()
    # dtypes are an internal detail; strip them before display/return.
    for record in datasets:
        record.pop("dtypes", None)
    if verbose:
        print(tabulate(datasets, headers="keys"))
    return datasets
obfuscate_private_information staticmethod
obfuscate_private_information(database_metadata)
Remove privacy-sensitive information, to prepare for sharing with a researcher.
Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to prevent sharing this information with a researcher through a reply message.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
database_metadata | Iterable[dict] | Iterable of metadata dicts, one per dataset. | required |
Returns:
| Type | Description |
|---|---|
Iterable[dict] | the updated iterable of metadata information objects without privacy-sensitive information |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
@staticmethod
def obfuscate_private_information(
database_metadata: Iterable[dict],
) -> Iterable[dict]:
"""Remove privacy-sensitive information, to prepare for sharing with a researcher.
Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to
prevent sharing this information with a researcher through a reply message.
Args:
database_metadata: Iterable of metadata dicts, one per dataset.
Returns:
the updated iterable of metadata information objects without privacy-sensitive information
"""
for d in database_metadata:
try:
# common obfuscations
d.pop("path", None)
# obfuscations specific for each data type
if "data_type" in d:
if d["data_type"] == "medical-folder":
if "dataset_parameters" in d:
d["dataset_parameters"].pop("tabular_file", None)
except AttributeError as e:
raise FedbiomedError(
f"{ErrorNumbers.FB632.value}: Object of type {type(d)} does not "
"support pop or getitem method in obfuscate_private_information."
) from e
return database_metadata
read_csv
read_csv(csv_file, index_col=None)
Gets content of a CSV file.
Reads a *.csv file and outputs its data into a pandas DataFrame. Finds automatically the CSV delimiter by parsing the first line.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
csv_file | str | File name / path | required |
index_col | Union[int, None] | Column that contains CSV file index. Defaults to None. | None |
Returns:
| Type | Description |
|---|---|
DataFrame | Pandas DataFrame with data contained in CSV file. |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def read_csv(
    self, csv_file: str, index_col: Union[int, None] = None
) -> pd.DataFrame:
    """Gets content of a CSV file.

    Reads a *.csv file and outputs its data into a pandas DataFrame.
    Finds automatically the CSV delimiter by parsing the first line.

    Args:
        csv_file: File name / path
        index_col: Column that contains CSV file index.
            Defaults to None.

    Returns:
        Pandas DataFrame with data contained in CSV file.
    """
    # Automatically identify separator and header
    sniffer = csv.Sniffer()
    with open(csv_file, "r") as file:
        delimiter = sniffer.sniff(file.readline()).delimiter
        file.seek(0)
        # Sniff the header from a bounded sample instead of reading the
        # whole file into memory (the previous file.read() loaded the entire
        # CSV just to detect a header). 64 KiB of leading rows is plenty for
        # the sniffer's heuristics.
        header = 0 if sniffer.has_header(file.read(65536)) else None
    return pd.read_csv(csv_file, index_col=index_col, sep=delimiter, header=header)
remove_dlp_by_id
remove_dlp_by_id(dlp_id)
Remove a data loading plan (DLP) and its associated data loading blocks (DLBs).
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dlp_id | str | ID of the DataLoadingPlan to remove. | required |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def remove_dlp_by_id(self, dlp_id: str) -> None:
    """Remove a data loading plan (DLP) and its associated data loading blocks (DLBs).

    Args:
        dlp_id: ID of the DataLoadingPlan to remove.
    """
    dlp_metadata, linked_blocks = self.get_dlp_by_id(dlp_id)
    if dlp_metadata is None:
        # Nothing stored under this id; nothing to remove.
        return
    self._dlp_table.delete_by_id(dlp_id)
    for block in linked_blocks:
        self._dlb_table.delete_by_id(block["dlb_id"])
save_data_loading_plan
save_data_loading_plan(data_loading_plan)
Save a DataLoadingPlan to the database.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data_loading_plan | Optional[DataLoadingPlan] | the DataLoadingPlan to be saved, or None. | required |
Returns:
| Type | Description |
|---|---|
Union[str, None] | The dlp_id if a DLP was saved, otherwise None.
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def save_data_loading_plan(
    self, data_loading_plan: Optional[DataLoadingPlan]
) -> Union[str, None]:
    """Save a DataLoadingPlan to the database.

    Args:
        data_loading_plan: the DataLoadingPlan to be saved, or None.

    Returns:
        The `dlp_id` if a DLP was saved, otherwise None.
    """
    # A missing plan is not an error; there is simply nothing to persist.
    if data_loading_plan is None:
        return None
    plan_metadata, block_metadata_list = data_loading_plan.serialize()
    self.dlp_table.insert(plan_metadata)
    for block_metadata in block_metadata_list:
        self.dlb_table.insert(block_metadata)
    return data_loading_plan.dlp_id
validate_samples
validate_samples(n_samples)
Raise FedbiomedError if n_samples is below the configured minimum.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
n_samples | int | Number of samples in the dataset to validate. | required |
Raises:
| Type | Description |
|---|---|
FedbiomedError | If n_samples is below the configured minimum. |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def validate_samples(self, n_samples: int) -> None:
    """Raise FedbiomedError if n_samples is below the configured minimum.

    Args:
        n_samples: Number of samples in the dataset to validate.

    Raises:
        FedbiomedError: If n_samples is below the configured minimum.
    """
    minimum = self._min_samples
    # A minimum of 0 (the default) disables the check entirely.
    if minimum <= 0 or n_samples >= minimum:
        return
    raise FedbiomedError(
        f"{ErrorNumbers.FB632.value}: Dataset has {n_samples} samples, "
        f"which is below the node's minimum required ({self._min_samples})."
    )