diff --git a/eval/plot.py b/eval/plot.py
index d15db4621eda3daa8629dc7bda989b4e32a2560a..d3c3a393fcd495ba2b5088b3df9fb8e5fb02253f 100644
--- a/eval/plot.py
+++ b/eval/plot.py
@@ -71,11 +71,21 @@ def plot_results(path):
         plot(means, stdevs, mins, maxs, "Testing Accuracy", folder, "lower right")
         plt.figure(6)
         means, stdevs, mins, maxs = get_stats([x["grad_std"] for x in results])
-        plot(means, stdevs, mins, maxs, "Gradient Variation over Nodes", folder, "upper right")
+        plot(
+            means,
+            stdevs,
+            mins,
+            maxs,
+            "Gradient Variation over Nodes",
+            folder,
+            "upper right",
+        )
         # Plot Testing loss
         plt.figure(7)
         means, stdevs, mins, maxs = get_stats([x["grad_mean"] for x in results])
-        plot(means, stdevs, mins, maxs, "Gradient Magnitude Mean", folder, "upper right")
+        plot(
+            means, stdevs, mins, maxs, "Gradient Magnitude Mean", folder, "upper right"
+        )
         # Collect total_bytes shared
         bytes_list = []
         for x in results:
diff --git a/src/decentralizepy/node/Node.py b/src/decentralizepy/node/Node.py
index 24c316818feb5698531e90eb498ced15253e03f5..e7aa9163818947d3fe523a3cf1f630aaa0f0ef41 100644
--- a/src/decentralizepy/node/Node.py
+++ b/src/decentralizepy/node/Node.py
@@ -177,7 +177,14 @@ class Node:
             ],
         )
         self.trainer = train_class(
-            self.model, self.optimizer, self.loss, self.log_dir, **train_params
+            self.rank,
+            self.machine_id,
+            self.mapping,
+            self.model,
+            self.optimizer,
+            self.loss,
+            self.log_dir,
+            **train_params
         )
 
     def init_comm(self, comm_configs):
diff --git a/src/decentralizepy/training/ChangeAccumulator.py b/src/decentralizepy/training/ChangeAccumulator.py
index cf9641fc8da1b117671f384650615ec7b4e8c77e..2e105720ebcf175fc07addadbe36954881bbea7d 100644
--- a/src/decentralizepy/training/ChangeAccumulator.py
+++ b/src/decentralizepy/training/ChangeAccumulator.py
@@ -16,6 +16,9 @@ class ChangeAccumulator(Training):
 
     def __init__(
         self,
+        rank,
+        machine_id,
+        mapping,
         model,
         optimizer,
         loss,
@@ -31,6 +34,12 @@ class ChangeAccumulator(Training):
 
         Parameters
         ----------
+        rank : int
+            Rank of the process local to the machine
+        machine_id : int
+            Machine ID on which the process is running
+        mapping : decentralizepy.mappings
+            The object containing the mapping rank <--> uid
         model : torch.nn.Module
             Neural Network for training
         optimizer : torch.optim
@@ -52,7 +61,17 @@ class ChangeAccumulator(Training):
 
         """
         super().__init__(
-            model, optimizer, loss, log_dir, rounds, full_epochs, batch_size, shuffle
+            rank,
+            machine_id,
+            mapping,
+            model,
+            optimizer,
+            loss,
+            log_dir,
+            rounds,
+            full_epochs,
+            batch_size,
+            shuffle,
         )
         self.save_accumulated = conditional_value(save_accumulated, "", True)
         self.communication_round = 0
diff --git a/src/decentralizepy/training/GradientAccumulator.py b/src/decentralizepy/training/GradientAccumulator.py
index 3171019f7397cb49b757e3a41315ec967fa5d27f..fcff8e6ec56e673c10098d32d973370251740913 100644
--- a/src/decentralizepy/training/GradientAccumulator.py
+++ b/src/decentralizepy/training/GradientAccumulator.py
@@ -11,6 +11,9 @@ class GradientAccumulator(Training):
 
     def __init__(
         self,
+        rank,
+        machine_id,
+        mapping,
         model,
         optimizer,
         loss,
@@ -25,6 +28,12 @@ class GradientAccumulator(Training):
 
         Parameters
         ----------
+        rank : int
+            Rank of the process local to the machine
+        machine_id : int
+            Machine ID on which the process is running
+        mapping : decentralizepy.mappings
+            The object containing the mapping rank <--> uid
         model : torch.nn.Module
             Neural Network for training
         optimizer : torch.optim
@@ -44,7 +53,17 @@ class GradientAccumulator(Training):
 
         """
         super().__init__(
-            model, optimizer, loss, log_dir, rounds, full_epochs, batch_size, shuffle
+            rank,
+            machine_id,
+            mapping,
+            model,
+            optimizer,
+            loss,
+            log_dir,
+            rounds,
+            full_epochs,
+            batch_size,
+            shuffle,
         )
 
     def trainstep(self, data, target):
diff --git a/src/decentralizepy/training/Training.py b/src/decentralizepy/training/Training.py
index 47c8f778fb38761f766504d7461b72c9b2005618..3b99befa457b31580880b3113bee76ba2e4e3feb 100644
--- a/src/decentralizepy/training/Training.py
+++ b/src/decentralizepy/training/Training.py
@@ -13,6 +13,9 @@ class Training:
 
     def __init__(
         self,
+        rank,
+        machine_id,
+        mapping,
         model,
         optimizer,
         loss,
@@ -27,6 +30,12 @@ class Training:
 
         Parameters
         ----------
+        rank : int
+            Rank of the process local to the machine
+        machine_id : int
+            Machine ID on which the process is running
+        mapping : decentralizepy.mappings
+            The object containing the mapping rank <--> uid
         model : torch.nn.Module
             Neural Network for training
         optimizer : torch.optim
@@ -49,6 +58,9 @@ class Training:
         self.optimizer = optimizer
         self.loss = loss
         self.log_dir = log_dir
+        self.rank = rank
+        self.machine_id = machine_id
+        self.mapping = mapping
         self.rounds = utils.conditional_value(rounds, "", int(1))
         self.full_epochs = utils.conditional_value(full_epochs, "", False)
         self.batch_size = utils.conditional_value(batch_size, "", int(1))
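
Note: after this patch, every trainer takes rank, machine_id, and mapping as its first three constructor arguments, and Training stores them as attributes so subclasses can identify the node they run on. Below is a minimal sketch of constructing a trainer directly under the new signature; the Linear(n_machines, procs_per_machine) mapping, the toy torch model, and all concrete values are illustrative assumptions for this example, not part of the patch. In a real run, Node builds the trainer from the config via train_class, as shown in the Node.py hunk above.

    import torch

    from decentralizepy.mappings.Linear import Linear
    from decentralizepy.training.Training import Training

    # Illustrative setup: one machine running 4 processes, so uid == rank.
    mapping = Linear(1, 4)
    model = torch.nn.Linear(10, 2)  # stand-in for a real decentralizepy model
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    loss = torch.nn.CrossEntropyLoss()

    trainer = Training(
        0,         # rank: process rank local to this machine
        0,         # machine_id: which machine the process runs on
        mapping,   # object mapping rank <--> uid
        model,
        optimizer,
        loss,
        "./logs",  # log_dir
        rounds=5,
        full_epochs=False,
        batch_size=64,
        shuffle=True,
    )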