From d05d893442dda3065acf154078f224112b3d2748 Mon Sep 17 00:00:00 2001
From: Rishi Sharma <rishi.sharma@epfl.ch>
Date: Thu, 10 Feb 2022 12:42:07 +0100
Subject: [PATCH] Bugfix ChangeAccumulator

---
 eval/plot.py                              | 14 +++++++++++--
 src/decentralizepy/node/Node.py           |  9 +++++++-
 .../training/ChangeAccumulator.py         | 21 ++++++++++++++++++-
 .../training/GradientAccumulator.py       | 21 ++++++++++++++++++-
 src/decentralizepy/training/Training.py   | 12 +++++++++++
 5 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/eval/plot.py b/eval/plot.py
index d15db46..d3c3a39 100644
--- a/eval/plot.py
+++ b/eval/plot.py
@@ -71,11 +71,21 @@ def plot_results(path):
     plot(means, stdevs, mins, maxs, "Testing Accuracy", folder, "lower right")
     plt.figure(6)
     means, stdevs, mins, maxs = get_stats([x["grad_std"] for x in results])
-    plot(means, stdevs, mins, maxs, "Gradient Variation over Nodes", folder, "upper right")
+    plot(
+        means,
+        stdevs,
+        mins,
+        maxs,
+        "Gradient Variation over Nodes",
+        folder,
+        "upper right",
+    )
     # Plot Testing loss
     plt.figure(7)
     means, stdevs, mins, maxs = get_stats([x["grad_mean"] for x in results])
-    plot(means, stdevs, mins, maxs, "Gradient Magnitude Mean", folder, "upper right")
+    plot(
+        means, stdevs, mins, maxs, "Gradient Magnitude Mean", folder, "upper right"
+    )
     # Collect total_bytes shared
     bytes_list = []
     for x in results:
diff --git a/src/decentralizepy/node/Node.py b/src/decentralizepy/node/Node.py
index 24c3168..e7aa916 100644
--- a/src/decentralizepy/node/Node.py
+++ b/src/decentralizepy/node/Node.py
@@ -177,7 +177,14 @@ class Node:
             ],
         )
         self.trainer = train_class(
-            self.model, self.optimizer, self.loss, self.log_dir, **train_params
+            self.rank,
+            self.machine_id,
+            self.mapping,
+            self.model,
+            self.optimizer,
+            self.loss,
+            self.log_dir,
+            **train_params
         )
 
     def init_comm(self, comm_configs):
diff --git a/src/decentralizepy/training/ChangeAccumulator.py b/src/decentralizepy/training/ChangeAccumulator.py
index cf9641f..2e10572 100644
--- a/src/decentralizepy/training/ChangeAccumulator.py
+++ b/src/decentralizepy/training/ChangeAccumulator.py
@@ -16,6 +16,9 @@ class ChangeAccumulator(Training):
 
     def __init__(
         self,
+        rank,
+        machine_id,
+        mapping,
         model,
         optimizer,
         loss,
@@ -31,6 +34,12 @@
 
         Parameters
         ----------
+        rank : int
+            Rank of process local to the machine
+        machine_id : int
+            Machine ID on which the process is running
+        mapping : decentralizepy.mappings
+            The object containing the mapping rank <--> uid
         model : torch.nn.Module
             Neural Network for training
         optimizer : torch.optim
@@ -52,7 +61,17 @@
 
         """
         super().__init__(
-            model, optimizer, loss, log_dir, rounds, full_epochs, batch_size, shuffle
+            rank,
+            machine_id,
+            mapping,
+            model,
+            optimizer,
+            loss,
+            log_dir,
+            rounds,
+            full_epochs,
+            batch_size,
+            shuffle,
         )
         self.save_accumulated = conditional_value(save_accumulated, "", True)
         self.communication_round = 0
diff --git a/src/decentralizepy/training/GradientAccumulator.py b/src/decentralizepy/training/GradientAccumulator.py
index 3171019..fcff8e6 100644
--- a/src/decentralizepy/training/GradientAccumulator.py
+++ b/src/decentralizepy/training/GradientAccumulator.py
@@ -11,6 +11,9 @@ class GradientAccumulator(Training):
 
     def __init__(
         self,
+        rank,
+        machine_id,
+        mapping,
         model,
         optimizer,
         loss,
@@ -25,6 +28,12 @@
 
         Parameters
         ----------
+        rank : int
+            Rank of process local to the machine
+        machine_id : int
+            Machine ID on which the process is running
+        mapping : decentralizepy.mappings
+            The object containing the mapping rank <--> uid
         model : torch.nn.Module
             Neural Network for training
         optimizer : torch.optim
@@ -44,7 +53,17 @@ class GradientAccumulator(Training):
 
         """
         super().__init__(
-            model, optimizer, loss, log_dir, rounds, full_epochs, batch_size, shuffle
+            rank,
+            machine_id,
+            mapping,
+            model,
+            optimizer,
+            loss,
+            log_dir,
+            rounds,
+            full_epochs,
+            batch_size,
+            shuffle,
         )
 
     def trainstep(self, data, target):
diff --git a/src/decentralizepy/training/Training.py b/src/decentralizepy/training/Training.py
index 47c8f77..3b99bef 100644
--- a/src/decentralizepy/training/Training.py
+++ b/src/decentralizepy/training/Training.py
@@ -13,6 +13,9 @@
 
     def __init__(
         self,
+        rank,
+        machine_id,
+        mapping,
         model,
         optimizer,
         loss,
@@ -27,6 +30,12 @@
 
         Parameters
         ----------
+        rank : int
+            Rank of process local to the machine
+        machine_id : int
+            Machine ID on which the process is running
+        mapping : decentralizepy.mappings
+            The object containing the mapping rank <--> uid
         model : torch.nn.Module
             Neural Network for training
         optimizer : torch.optim
@@ -49,6 +58,9 @@
         self.optimizer = optimizer
         self.loss = loss
         self.log_dir = log_dir
+        self.rank = rank
+        self.machine_id = machine_id
+        self.mapping = mapping
        self.rounds = utils.conditional_value(rounds, "", int(1))
         self.full_epochs = utils.conditional_value(full_epochs, "", False)
         self.batch_size = utils.conditional_value(batch_size, "", int(1))
-- 
GitLab
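
Note: the patch threads rank, machine_id, and mapping from Node through Training
and both accumulator subclasses, so each trainer knows which local process it
serves and can translate between its rank and a global uid. The sketch below
shows how a trainer is constructed under the signature introduced here;
IdentityMapping and all literal argument values are hypothetical stand-ins for
illustration, not code from this repository.

    import torch

    from decentralizepy.training.Training import Training


    class IdentityMapping:
        """Toy stand-in for a decentralizepy.mappings object (single machine)."""

        def get_uid(self, rank, machine_id):
            # On one machine the local rank doubles as the global uid.
            return rank


    model = torch.nn.Linear(10, 2)
    trainer = Training(
        rank=0,                     # new: process rank local to the machine
        machine_id=0,               # new: id of the machine running the process
        mapping=IdentityMapping(),  # new: rank <--> uid mapping object
        model=model,
        optimizer=torch.optim.SGD(model.parameters(), lr=0.01),
        loss=torch.nn.CrossEntropyLoss(),
        log_dir="/tmp/logs",
        rounds=1,
        full_epochs=False,
        batch_size=16,
        shuffle=True,
    )

Because the three new parameters come first positionally, a subclass that still
forwarded arguments starting at model would silently bind them to the wrong
names, which is why every super().__init__ call is updated in this patch.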