Round

Implementation of the Round class of the node component.

Classes

Round

Round(training_plan, training_plan_class, model_kwargs, training_kwargs, training, dataset, params, experiment_id, researcher_id, history_monitor, aggregator_args, node_args, round_number=0, dlp_and_loading_block_metadata=None, aux_vars=None)

This class represents the training part executed by a node in a given round.

Parameters:

training_plan (str, required): source code of the training plan for this round.
training_plan_class (str, required): class name of the training plan.
model_kwargs (dict, required): model arguments.
training_kwargs (dict, required): training arguments.
training (bool, required): whether to perform model training or only a validation check (model inference).
dataset (dict, required): details of the dataset to use in this round: the dataset name, its id, the data path, its shape, its description, etc.
params (str, required): parameters of the model.
experiment_id (str, required): experiment id.
researcher_id (str, required): researcher id.
history_monitor (HistoryMonitor, required): sends real-time feedback to the end user during training.
aggregator_args (Dict[str, Any], required): arguments managed by and shared with the researcher-side aggregator.
node_args (Dict, required): command line arguments for the node. Can include:
    - `gpu (bool)`: propose using a GPU device if one is available.
    - `gpu_num (Union[int, None])`: if not None, use the specified GPU device instead of the default GPU device, provided it is available.
    - `gpu_only (bool)`: force use of a GPU device if one is available, even if the researcher did not request GPU usage.
dlp_and_loading_block_metadata (Optional[Tuple[dict, List[dict]]], default None): data loading plan to apply, or None if there is no DLP for this round.
round_number (int, default 0): number of the iteration for this experiment.
aux_vars (Optional[List], default None): auxiliary variables of the model.

(A minimal instantiation sketch follows the constructor source below.)
Source code in fedbiomed/node/round.py
def __init__(
    self,
    training_plan: str,
    training_plan_class: str,
    model_kwargs: dict,
    training_kwargs: dict,
    training: bool,
    dataset: dict,
    params: str,
    experiment_id: str,
    researcher_id: str,
    history_monitor: HistoryMonitor,
    aggregator_args: Dict[str, Any],
    node_args: Dict,
    round_number: int = 0,
    dlp_and_loading_block_metadata: Optional[Tuple[dict, List[dict]]] = None,
    aux_vars: Optional[List[str]] = None,
) -> None:
    """Constructor of the class

    Args:
        training_plan: code of the training plan for this round
        training_plan_class: class name of the training plan
        model_kwargs: contains model arguments.
        training_kwargs: contains training arguments.
        training: whether to perform model training or only a validation check (model inference)
        dataset: dataset details to use in this round. It contains the dataset name, dataset's id,
            data path, its shape, its description... . Defaults to None.
        params: parameters of the model
        experiment_id: experiment id
        researcher_id: researcher id
        history_monitor: Sends real-time feed-back to end-user during training
        aggregator_args: Arguments managed by and shared with the
            researcher-side aggregator.
        node_args: command line arguments for node. Can include:
            - `gpu (bool)`: propose use a GPU device if any is available.
            - `gpu_num (Union[int, None])`: if not None, use the specified GPU device instead of default
                GPU device if this GPU device is available.
            - `gpu_only (bool)`: force use of a GPU device if any available, even if researcher
                doesn't request for using a GPU.
        dlp_and_loading_block_metadata: Data loading plan to apply, or None if no DLP for this round.
        round_number: number of the iteration for this experiment
        aux_vars: auxiliary variables of the model.
    """

    self.dataset = dataset
    self.training_plan_source = training_plan
    self.training_plan_class = training_plan_class
    self.params = params
    self.experiment_id = experiment_id
    self.researcher_id = researcher_id
    self.history_monitor = history_monitor
    self.aggregator_args = aggregator_args
    self.aux_vars = aux_vars or []
    self.node_args = node_args
    self.training = training
    self._dlp_and_loading_block_metadata = dlp_and_loading_block_metadata
    self.training_kwargs = training_kwargs
    self.model_arguments = model_kwargs

    # Class attributes
    self.tp_security_manager = TrainingPlanSecurityManager()
    self.training_plan = None
    self.testing_arguments = None
    self.loader_arguments = None
    self.training_arguments = None
    self._secure_aggregation = None
    self._round = round_number
    self._node_state_manager: NodeStateManager = NodeStateManager(environ['DB_PATH'])

    self._keep_files_dir = tempfile.mkdtemp(prefix=environ['TMP_DIR'])
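
A minimal instantiation sketch follows. It is illustrative only: every literal value below is a hypothetical placeholder (a node normally builds a Round from an incoming training request rather than by hand), and the exact keys expected in model_kwargs, training_kwargs and dataset depend on the training plan and node configuration.

from fedbiomed.node.round import Round

training_plan_source = "..."  # stand-in: full source code of the training plan
history_monitor = None        # stand-in: real code passes a HistoryMonitor instance

round_ = Round(
    training_plan=training_plan_source,
    training_plan_class="MyTrainingPlan",   # hypothetical class name
    model_kwargs={"in_features": 15, "out_features": 1},
    training_kwargs={"epochs": 1},
    training=True,                          # False runs a validation-only round
    dataset={"dataset_id": "dataset_1234", "path": "/path/to/dataset"},
    params="...",                           # stand-in: model parameters from the researcher
    experiment_id="experiment_1234",
    researcher_id="researcher_1234",
    history_monitor=history_monitor,
    aggregator_args={},
    node_args={"gpu": True, "gpu_num": None, "gpu_only": False},
    round_number=0,
)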

Attributes

All of the following are instance attributes, set in the constructor:

aggregator_args = aggregator_args
aux_vars = aux_vars or []
dataset = dataset
experiment_id = experiment_id
history_monitor = history_monitor
loader_arguments = None
model_arguments = model_kwargs
node_args = node_args
params = params
researcher_id = researcher_id
testing_arguments = None
tp_security_manager = TrainingPlanSecurityManager()
training = training
training_arguments = None
training_kwargs = training_kwargs
training_plan = None
training_plan_class = training_plan_class
training_plan_source = training_plan

Functions

collect_optim_aux_var
collect_optim_aux_var()

Collect auxiliary variables from the wrapped Optimizer, if any.

If the TrainingPlan does not use a Fed-BioMed Optimizer, an empty dict is returned. If the TrainingPlan does not hold any BaseOptimizer, however, a FedbiomedRoundError is raised.

Returns:

Dict[str, Any]: Auxiliary variables.

Source code in fedbiomed/node/round.py
def collect_optim_aux_var(self) -> Dict[str, Any]:
    """Collect auxiliary variables from the wrapped Optimizer, if any.

    If the TrainingPlan does not use a Fed-BioMed Optimizer, return an
    empty dict. If it does not hold any BaseOptimizer however, raise a
    FedbiomedRoundError.

    Returns:
        Auxiliary variables
    """
    optimizer = self._get_base_optimizer()
    if isinstance(optimizer.optimizer, Optimizer):
        aux_var = optimizer.optimizer.get_aux()

        if aux_var and (self._secure_aggregation is None or self._secure_aggregation.use_secagg):
            # TODO: remove the following warning when secagg compatibility has been fixed
            # if secagg is used, raise a warning that encryption is not working with auxiliary variable
            logger.warning(f'Node {environ["NODE_ID"]} optimizer is sending auxiliary variables to the '
                           'Researcher, but those are not encrypted with SecAgg. '
                           'Auxiliary Variables may contain sensitive information about the Nodes. '
                           'This issue will be fixed in a future version of Fed-BioMed',
                           researcher_id=self.researcher_id)
        return aux_var
    return {}
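
A hedged usage sketch (assuming `round_` is a Round whose training plan wraps a Fed-BioMed Optimizer, for instance one equipped with a declearn Scaffold module; names below are placeholders):

aux_var = round_.collect_optim_aux_var()
# With an auxiliary-variable module such as Scaffold plugged into the
# Optimizer, aux_var maps module names to their shareable state, e.g.
# {"scaffold": <state to send to the researcher>}; with a plain framework
# optimizer (e.g. torch.optim.SGD) it is simply {}.
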
initialize_arguments
initialize_arguments(previous_state_id=None)

Initializes the arguments for training and testing, as well as the NodeStateManager, which handles Node state loading and saving.

Parameters:

previous_state_id (Optional[str], default None): previous Node state id. Defaults to None, which is the state_id default value for the first Round.

Returns:

Optional[Dict[str, Any]]: A dictionary containing the error message if an error is triggered while parsing the training and testing arguments, None otherwise.

Source code in fedbiomed/node/round.py
def initialize_arguments(self,
                         previous_state_id: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Initializes arguments for training and testing and the NodeStateManager, the latter handling
    Node state loading and saving.

    Args:
        previous_state_id: previous Node state id. Defaults to None (which is the state_id default value for the first Round).

    Returns:
        A dictionary containing the error message if an error is triggered while parsing training and testing
        arguments, None otherwise.
    """
    # initialize Node State Manager
    self._node_state_manager.initialize(previous_state_id=previous_state_id,
                                        testing=not self.training)
    return self._initialize_validate_training_arguments()
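
For illustration, the first round of an experiment would be initialized as sketched below (`round_` is assumed to be a freshly constructed Round):

# No previous state exists on the very first round of an experiment.
error = round_.initialize_arguments(previous_state_id=None)
if error is not None:
    # `error` is a dict describing what went wrong while parsing the
    # training/testing arguments; the node should reply with it instead
    # of proceeding to training.
    ...
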
process_optim_aux_var
process_optim_aux_var()

Process researcher-emitted Optimizer auxiliary variables, if any.

Returns:

Optional[str]: Error message, or None if the operation was successful.

Source code in fedbiomed/node/round.py
def process_optim_aux_var(self) -> Optional[str]:
    """Process researcher-emitted Optimizer auxiliary variables, if any.

    Returns:
        Error message, empty if the operation was successful.
    """
    # Early-exit if there are no auxiliary variables to process.
    if not any(self.aux_vars):
        return

    aux_vars = {}
    aux_vars.update(self.aux_vars[0])
    aux_vars.update(self.aux_vars[1])

    # Fetch the training plan's BaseOptimizer.
    try:
        optimizer = self._get_base_optimizer()
    except FedbiomedRoundError as exc:
        return str(exc)

    # Verify that the BaseOptimizer wraps an Optimizer.
    if not isinstance(optimizer.optimizer, Optimizer):
        return (
            "Received Optimizer auxiliary variables, but the "
            "TrainingPlan does not manage a compatible Optimizer."
        )
    # Pass auxiliary variables to the Optimizer.
    try:
        optimizer.optimizer.set_aux(aux_vars)
    except FedbiomedOptimizerError as exc:
        return (
            "TrainingPlan Optimizer failed to ingest the provided "
            f"auxiliary variables: {repr(exc)}"
        )

    return
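
Judging from the indexing above, self.aux_vars is expected to hold two dicts that are merged in order, so the second one overrides the first on duplicate keys. A hypothetical sketch (all names and values are placeholders):

shared_state = {}         # stand-in for values shared across nodes
node_specific_state = {}  # stand-in for values addressed to this node only

round_.aux_vars = [{"scaffold": shared_state}, {"scaffold": node_specific_state}]
error = round_.process_optim_aux_var()
if error:
    ...  # report the error message back to the researcher
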
run_model_training
run_model_training(secagg_arguments=None)

Runs one round of model training

Parameters:

secagg_arguments (Union[Dict, None], default None): arguments for secure aggregation; some are specific to the scheme.

Returns:

Optional[Dict[str, Any]]: The corresponding node message (a training reply instance).

Source code in fedbiomed/node/round.py
def run_model_training(
        self,
        secagg_arguments: Union[Dict, None] = None,
) -> Optional[Dict[str, Any]]:
    """Runs one round of model training

    Args:
        secagg_arguments: arguments for secure aggregation, some are specific to the scheme

    Returns:
        Returns the corresponding node message, training reply instance
    """
    # Validate secagg status. Raises error if the training request is not compatible with
    # secure aggregation settings
    try:
        self._secure_aggregation = SecaggRound(secagg_arguments, self.experiment_id)
    except FedbiomedSecureAggregationError as e:
        logger.error(str(e))
        return self._send_round_reply(
            success=False,
            message='Could not configure secure aggregation on node')

    # Validate and load training plan
    if environ["TRAINING_PLAN_APPROVAL"]:
        approved, training_plan_ = self.tp_security_manager.\
            check_training_plan_status(
                self.training_plan_source,
                TrainingPlanApprovalStatus.APPROVED)

        if not approved:
            return self._send_round_reply(
                False,
                f'Requested training plan is not approved by the node: {environ["NODE_ID"]}')
        else:
            logger.info(f'Training plan {training_plan_["name"]} has been approved by the node',
                        researcher_id=self.researcher_id)

    # Import training plan, save to file, reload, instantiate a training plan
    try:
        CurrentTPModule, CurrentTrainingPlan = utils.import_class_from_spec(
            code=self.training_plan_source, class_name=self.training_plan_class)
        self.training_plan = CurrentTrainingPlan()
    except Exception:
        error_message = "Cannot instantiate training plan object."
        return self._send_round_reply(success=False, message=error_message)

    # save and load training plan to a file to be sure
    # 1. a file is associated to training plan so we can read its source, etc.
    # 2. all dependencies are applied
    training_plan_module = 'model_' + str(uuid.uuid4())
    training_plan_file = os.path.join(self._keep_files_dir, training_plan_module + '.py')
    try:
        self.training_plan.save_code(training_plan_file, from_code=self.training_plan_source)
    except Exception as e:
        error_message = "Cannot save the training plan to a local tmp dir"
        logger.error(f"Cannot save the training plan to a local tmp dir : {e}")
        return self._send_round_reply(success=False, message=error_message)

    del CurrentTrainingPlan
    del CurrentTPModule

    try:
        CurrentTPModule, self.training_plan = utils.import_class_object_from_file(
            training_plan_file, self.training_plan_class)
    except Exception:
        error_message = "Cannot load training plan object from file."
        return self._send_round_reply(success=False, message=error_message)

    try:
        self.training_plan.post_init(model_args=self.model_arguments,
                                     training_args=self.training_arguments,
                                     aggregator_args=self.aggregator_args)
    except Exception:
        error_message = "Can't initialize training plan with the arguments."
        return self._send_round_reply(success=False, message=error_message)

    # load node state
    previous_state_id = self._node_state_manager.previous_state_id
    if previous_state_id is not None:
        try:
            self._load_round_state(previous_state_id)
        except Exception:
            # don't send error details
            return self._send_round_reply(success=False, message="Can't read previous node state.")

    # Load model parameters received from researcher
    try:
        self.training_plan.set_model_params(self.params)
    except Exception:
        error_message = "Cannot initialize model parameters."
        return self._send_round_reply(success=False, message=error_message)
    # ---------------------------------------------------------------------

    # Process Optimizer auxiliary variables, if any.
    error_message = self.process_optim_aux_var()
    if error_message:
        return self._send_round_reply(success=False, message=error_message)

    # Split training and validation data -------------------------------------
    try:
        self._set_training_testing_data_loaders()
    except FedbiomedError as fe:
        error_message = f"Can not create validation/train data: {repr(fe)}"
        return self._send_round_reply(success=False, message=error_message)
    except Exception as e:
        error_message = f"Undetermined error while creating data for training/validation. Can not create " \
                        f"validation/train data: {repr(e)}"
        return self._send_round_reply(success=False, message=error_message)
    # ------------------------------------------------------------------------


    # Validation Before Training
    if self.testing_arguments.get('test_on_global_updates', False) is not False:

        # Last control to make sure validation data loader is set.
        if self.training_plan.testing_data_loader is not None:
            try:
                self.training_plan.testing_routine(metric=self.testing_arguments.get('test_metric', None),
                                                   metric_args=self.testing_arguments.get('test_metric_args', {}),
                                                   history_monitor=self.history_monitor,
                                                   before_train=True)
            except FedbiomedError as e:
                logger.error(f"{ErrorNumbers.FB314}: During the validation phase on global parameter updates; "
                             f"{repr(e)}", researcher_id=self.researcher_id)
            except Exception as e:
                logger.error(f"Undetermined error during the testing phase on global parameter updates: "
                             f"{repr(e)}", researcher_id=self.researcher_id)
        else:
            logger.error(f"{ErrorNumbers.FB314}: Can not execute validation routine due to missing testing dataset"
                         f"Please make sure that `test_ratio` has been set correctly",
                         researcher_id=self.researcher_id)

    # If training is activated.
    if self.training:
        results = {}

        # Perform the training round.
        if self.training_plan.training_data_loader is not None:
            try:
                rtime_before = time.perf_counter()
                ptime_before = time.process_time()
                self.training_plan.training_routine(history_monitor=self.history_monitor,
                                                    node_args=self.node_args)
                rtime_after = time.perf_counter()
                ptime_after = time.process_time()
            except Exception as exc:
                error_message = f"Cannot train model in round: {repr(exc)}"
                return self._send_round_reply(success=False, message=error_message)

        # Collect Optimizer auxiliary variables, if any.
        try:
            results['optim_aux_var'] = self.collect_optim_aux_var()
        except (FedbiomedOptimizerError, FedbiomedRoundError) as exc:
            error_message = f"Cannot collect Optimizer auxiliary variables: {repr(exc)}"
            return self._send_round_reply(success=False, message=error_message)

        # Validation after training
        if self.testing_arguments.get('test_on_local_updates', False) is not False:

            if self.training_plan.testing_data_loader is not None:
                try:
                    self.training_plan.testing_routine(metric=self.testing_arguments.get('test_metric', None),
                                                       metric_args=self.testing_arguments.get('test_metric_args',
                                                                                              {}),
                                                       history_monitor=self.history_monitor,
                                                       before_train=False)
                except FedbiomedError as e:
                    logger.error(
                        f"{ErrorNumbers.FB314.value}: During the validation phase on local parameter updates; "
                        f"{repr(e)}", researcher_id=self.researcher_id)
                except Exception as e:
                    logger.error(f"Undetermined error during the validation phase on local parameter updates"
                                 f"{repr(e)}", researcher_id=self.researcher_id)
            else:
                logger.error(
                    f"{ErrorNumbers.FB314.value}: Can not execute validation routine due to missing testing "
                    f"dataset. Please make sure that test_ratio has been set correctly",
                    researcher_id=self.researcher_id)

        # FIXME: this will fail if `self.training_plan.training_data_loader = None` (see issue )
        results["sample_size"] = len(self.training_plan.training_data_loader.dataset)

        results["encrypted"] = False
        model_weights = self.training_plan.after_training_params(flatten=self._secure_aggregation.use_secagg)

        if self._secure_aggregation.use_secagg:

            logger.info("Encrypting model parameters. This process can take some time depending on model size.",
                        researcher_id=self.researcher_id)

            model_weights = self._secure_aggregation.scheme.encrypt(
                params=model_weights,
                current_round=self._round,
                weight=results['sample_size'],
            )

            results["encrypted"] = True
            results["encryption_factor"] = None
            if self._secure_aggregation.scheme.secagg_random is not None and environ['SECAGG_INSECURE_VALIDATION']:
                results["encryption_factor"] = self._secure_aggregation.scheme.encrypt(
                    params=[self._secure_aggregation.scheme.secagg_random],
                    current_round=self._round,
                    weight=results['sample_size'])
            logger.info("Encryption is completed!",
                        researcher_id=self.researcher_id)
        results['params'] = model_weights
        results['optimizer_args'] = self.training_plan.optimizer_args()
        results['state_id'] = self._node_state_manager.state_id

        try:
            self._save_round_state()
        except Exception:
            # don't send details to researcher
            return self._send_round_reply(success=False, message="Can't save new node state.")

        # end : clean the namespace
        try:
            del self.training_plan
            del CurrentTPModule
        except Exception:
            logger.debug('Exception raised while deleting training plan instance')

        return self._send_round_reply(success=True,
                                      timing={'rtime_training': rtime_after - rtime_before,
                                              'ptime_training': ptime_after - ptime_before},
                                      extend_with=results)
    else:
        # Only for validation
        return self._send_round_reply(success=True)
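
Putting it together, a node-side driver might run one round as sketched below. This is hedged: message handling and Round construction are omitted, and secagg_arguments is left at None, which disables secure aggregation for the round.

error = round_.initialize_arguments(previous_state_id=None)
if error is not None:
    ...  # reply to the researcher with the error instead of training

reply = round_.run_model_training(secagg_arguments=None)
# On success, `reply` carries the training reply: the (possibly encrypted)
# model parameters, timing information, sample_size, optimizer auxiliary
# variables and the new node state id.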