Classes that simplify imports from fedbiomed.node.dataset_manager
Classes
DatasetManager
DatasetManager(path)
Interfaces with the node component database.
Facility for storing data, retrieving data and getting data info for the node. Currently uses TinyDB.
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def __init__(self, path: str) -> None:
    """Initialize the manager's table accessors from a database path.

    Args:
        path: Database path, passed unchanged to each of the three table
            constructors.
    """
    # One table object each for datasets, data loading plans (DLP) and data
    # loading blocks (DLB); all three are built from the same `path`.
    self._dataset_table = DatasetTable(path)
    self._dlp_table = DlpTable(path)
    self._dlb_table = DlbTable(path)
Attributes
dataset_table property
dataset_table
dlb_table property
dlb_table
dlp_table property
dlp_table
Functions
add_database
add_database(name, data_type, tags, description, path, dataset_id=None, dataset_parameters=None, data_loading_plan=None, save_dlp=True)
Register a dataset in the database.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
name | str | Name of the dataset | required |
data_type | str | Type of the dataset (e.g. 'tabular', 'image-folder', 'medical-folder') | required |
tags | Union[tuple, list] | Tags associated with the dataset | required |
description | str | Description of the dataset | required |
path | str | Path to the dataset | required |
dataset_id | Optional[str] | Optional ID for the dataset. If None, a new ID will be generated. | None |
dataset_parameters | Optional[dict] | Optional parameters for the dataset controller | None |
data_loading_plan | Optional[DataLoadingPlan] | Optional DataLoadingPlan associated with the dataset | None |
save_dlp | bool | Whether to save the DataLoadingPlan to the database if provided | True |
Returns:
| Type | Description |
|---|---|
| The dataset_id of the registered dataset |
Raises:
| Type | Description |
|---|---|
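FedbiomedError | If there are conflicting tags with existing datasets; if the data loading plan name is invalid or not unique; if the data_type is not supported |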
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def add_database(
    self,
    name: str,
    data_type: str,
    tags: Union[tuple, list],
    description: str,
    path: str,
    dataset_id: Optional[str] = None,
    dataset_parameters: Optional[dict] = None,
    data_loading_plan: Optional[DataLoadingPlan] = None,
    save_dlp: bool = True,
):
    """Create a dataset entry in the node database.

    A controller is obtained for `data_type`; the stored path, shape, dtypes
    and remaining controller parameters are read back from that controller
    before the entry is inserted into the dataset table.

    Args:
        name: Name of the dataset
        data_type: Type of the dataset (e.g. 'tabular', 'image-folder', 'medical-folder')
        tags: Tags associated with the dataset
        description: Description of the dataset
        path: Path to the dataset
        dataset_id: Optional ID for the dataset. If None, a new ID will be generated.
        dataset_parameters: Optional parameters for the dataset controller
        data_loading_plan: Optional DataLoadingPlan associated with the dataset
        save_dlp: Whether to save the DataLoadingPlan to the database if provided

    Returns:
        The dataset_id of the registered dataset, as returned by the dataset
        table's `insert`.

    Raises:
        FedbiomedError:
            - If there are conflicting tags with existing datasets
            - If the data loading plan name is invalid or not unique
            - If the data_type is not supported
    """
    # Caller-supplied parameters are unpacked last, so on a key collision
    # they take precedence over the "root" and "dlp" entries.
    extra_parameters = {} if dataset_parameters is None else dataset_parameters
    controller_parameters = {
        "root": path,
        "dlp": data_loading_plan,
        **extra_parameters,
    }
    controller = get_controller(
        data_type, controller_parameters=controller_parameters
    )

    # "root" and "dlp" are stored in their own fields ("path" and "dlp_id"),
    # so they are left out of the persisted dataset parameters.
    stored_separately = ("root", "dlp")
    entry = {
        "dataset_id": dataset_id,
        "name": name,
        "data_type": data_type,
        "tags": tags,
        "description": description,
        "path": controller._controller_kwargs["root"],
        "shape": controller.shape(),
        "dtypes": controller.get_types(),
        "dataset_parameters": {
            key: value
            for key, value in controller._controller_kwargs.items()
            if key not in stored_separately
        },
        "dlp_id": (
            data_loading_plan.dlp_id if data_loading_plan is not None else None
        ),
    }
    dataset_entry = self.dataset_table.insert(entry=entry)

    if save_dlp:
        self.save_data_loading_plan(data_loading_plan)
    return dataset_entry
get_dlp_by_id
get_dlp_by_id(dlp_id)
Get data loading plan by ID and its associated data loading blocks.
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def get_dlp_by_id(self, dlp_id: str) -> Tuple[Optional[dict], List[dict]]:
    """Look up a data loading plan and the data loading blocks it references.

    Args:
        dlp_id: the DataLoadingPlan id

    Returns:
        A pair `(dlp_metadata, dlbs_metadata)`. When no plan is found for
        `dlp_id` the pair is `(None, [])`.
    """
    plan = self.dlp_table.get_by_id(dlp_id)
    if plan is not None:
        # The plan's "loading_blocks" mapping holds the ids of its blocks as values.
        block_ids = [*plan["loading_blocks"].values()]
        blocks = self.dlb_table.get_all_by_value("dlb_id", block_ids)
        return plan, blocks
    return None, []
list_my_datasets
list_my_datasets(verbose=True)
Lists all datasets on the node.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
verbose | bool | Give verbose output. Defaults to True. | True |
Returns:
| Type | Description |
|---|---|
List[dict] | All datasets in the node's database. |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def list_my_datasets(self, verbose: bool = True) -> List[dict]:
    """Return every dataset entry stored on the node.

    The `dtypes` field is removed from each entry before the entries are
    printed or returned.

    Args:
        verbose: When True, also print the entries as a table. Defaults to True.

    Returns:
        All datasets in the node's database, without their `dtypes` field.
    """
    datasets = self.dataset_table.all()
    for entry in datasets:
        # dtypes are deliberately left out of the listing
        entry.pop("dtypes", None)

    if not verbose:
        return datasets
    print(tabulate(datasets, headers="keys"))
    return datasets
obfuscate_private_information staticmethod
obfuscate_private_information(database_metadata)
Remove privacy-sensitive information, to prepare for sharing with a researcher.
Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to prevent sharing this information with a researcher through a reply message.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
database_metadata | Iterable[dict] | an iterable of metadata information objects, one per dataset. Each metadata object should be in the format of key-value pairs, such as e.g. a dict. | required |
Returns:
| Type | Description |
|---|---|
Iterable[dict] | the updated iterable of metadata information objects without privacy-sensitive information |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
@staticmethod
def obfuscate_private_information(
    database_metadata: Iterable[dict],
) -> Iterable[dict]:
    """Remove privacy-sensitive information, to prepare for sharing with a researcher.

    Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to
    prevent sharing this information with a researcher through a reply message.

    The metadata objects are modified in place: the `path` entry is dropped from every object and, for
    `medical-folder` datasets, the `tabular_file` entry is dropped from `dataset_parameters`.

    Args:
        database_metadata: an iterable of metadata information objects, one per dataset. Each metadata object
            should be in the format of key-value pairs, such as e.g. a dict.

    Returns:
        the updated iterable of metadata information objects without privacy-sensitive information

    Raises:
        FedbiomedError: if a metadata object does not support the mapping operations used here
            (`pop`, `in` and item access).
    """
    for d in database_metadata:
        try:
            # common obfuscations
            d.pop("path", None)
            # obfuscations specific for each data type
            if (
                "data_type" in d
                and d["data_type"] == "medical-folder"
                and "dataset_parameters" in d
            ):
                d["dataset_parameters"].pop("tabular_file", None)
        except (AttributeError, TypeError) as e:
            # An object without `pop` raises AttributeError. An object that is
            # not subscriptable, does not support `in`, or whose `pop` takes
            # different arguments (e.g. a list) raises TypeError. Both mean
            # the object is not a usable key-value mapping.
            raise FedbiomedError(
                f"{ErrorNumbers.FB632.value}: Object of type {type(d)} does not "
                "support pop or getitem method in obfuscate_private_information."
            ) from e
    return database_metadata
read_csv
read_csv(csv_file, index_col=None)
Gets content of a CSV file.
Reads a *.csv file and loads its data into a pandas DataFrame. The CSV delimiter is detected automatically by parsing the first line.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
csv_file | str | File name / path | required |
index_col | Union[int, None] | Column that contains CSV file index. Defaults to None. | None |
Returns:
| Type | Description |
|---|---|
DataFrame | Pandas DataFrame with data contained in CSV file. |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def read_csv(
    self, csv_file: str, index_col: Union[int, None] = None
) -> pd.DataFrame:
    """Gets content of a CSV file.

    Reads a *.csv file and loads its data into a pandas DataFrame.
    The delimiter is detected automatically from the first line of the file,
    and the presence of a header row is detected from the file's content.

    Args:
        csv_file: File name / path
        index_col: Column that contains CSV file index.
            Defaults to None.

    Returns:
        Pandas DataFrame with data contained in CSV file.

    Raises:
        csv.Error: raised by `csv.Sniffer` when it cannot determine the
            delimiter (for instance when the file is empty).
    """
    # Automatically identify separator and header
    sniffer = csv.Sniffer()
    # Decode explicitly as UTF-8, which is what `pd.read_csv` uses by default,
    # so that detection does not depend on the platform's locale encoding.
    with open(csv_file, "r", encoding="utf-8") as file:
        delimiter = sniffer.sniff(file.readline()).delimiter
        file.seek(0)
        header = 0 if sniffer.has_header(file.read()) else None
    return pd.read_csv(csv_file, index_col=index_col, sep=delimiter, header=header)
remove_dlp_by_id
remove_dlp_by_id(dlp_id)
Removes a data loading plan (DLP) from the database, along with its associated data loading blocks (DLBs).
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
dlp_id | str | the DataLoadingPlan id | required |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def remove_dlp_by_id(self, dlp_id: str):
    """Delete a data loading plan (DLP) and the data loading blocks (DLBs) attached to it.

    Nothing is deleted when `get_dlp_by_id` finds no DLP for `dlp_id`.

    Args:
        dlp_id: the DataLoadingPlan id
    """
    plan, blocks = self.get_dlp_by_id(dlp_id)
    if plan is None:
        return

    # Delete the plan first, then each block that was looked up for it.
    self._dlp_table.delete_by_id(dlp_id)
    for block in blocks:
        self._dlb_table.delete_by_id(block["dlb_id"])
save_data_loading_plan
save_data_loading_plan(data_loading_plan)
Save a DataLoadingPlan to the database.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data_loading_plan | Optional[DataLoadingPlan] | the DataLoadingPlan to be saved, or None. | required |
Returns:
| Type | Description |
|---|---|
Union[str, None] | The `dlp_id` if a DLP was saved, otherwise None |
Source code in fedbiomed/node/dataset_manager/_dataset_manager.py
def save_data_loading_plan(
    self, data_loading_plan: Optional[DataLoadingPlan]
) -> Union[str, None]:
    """Persist a DataLoadingPlan together with its data loading blocks.

    The plan is serialized, its metadata is inserted into the DLP table and
    each block's metadata is inserted into the DLB table.

    Args:
        data_loading_plan: the DataLoadingPlan to be saved, or None.

    Returns:
        The `dlp_id` if a DLP was saved, otherwise None
    """
    if data_loading_plan is not None:
        plan_record, block_records = data_loading_plan.serialize()
        self.dlp_table.insert(plan_record)
        for block_record in block_records:
            self.dlb_table.insert(block_record)
        return data_loading_plan.dlp_id
    return None