import json
import os
import sys

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt


def get_stats(l):
    assert len(l) > 0
    mean_dict, stdev_dict, min_dict, max_dict = {}, {}, {}, {}
    for key in l[0].keys():
        all_nodes = [i[key] for i in l]
        all_nodes = np.array(all_nodes)
        mean = np.mean(all_nodes)
        std = np.std(all_nodes)
        min = np.min(all_nodes)
        max = np.max(all_nodes)
        mean_dict[int(key)] = mean
        stdev_dict[int(key)] = std
        min_dict[int(key)] = min
        max_dict[int(key)] = max
    return mean_dict, stdev_dict, min_dict, max_dict


def plot(means, stdevs, mins, maxs, title, label, loc):
    plt.title(title)
    plt.xlabel("communication rounds")
    x_axis = list(means.keys())
    y_axis = list(means.values())
    err = list(stdevs.values())
    plt.errorbar(x_axis, y_axis, yerr=err, label=label)
    plt.legend(loc=loc)


def plot_results(path):
    """
    Plots the percentiles of the shared parameters.
    Based on plot.py

    Parameters
    ----------
    path
        Path to the folders from which to create the percentiles plots.

    """
    folders = os.listdir(path)
    folders.sort()
    print("Reading folders from: ", path)
    print("Folders: ", folders)
    for folder in folders:
        folder_path = os.path.join(path, folder)
        if not os.path.isdir(folder_path):
            continue
        results = []
        all_shared_params = []
        machine_folders = os.listdir(folder_path)
        for machine_folder in machine_folders:
            mf_path = os.path.join(folder_path, machine_folder)
            if not os.path.isdir(mf_path):
                continue
            files = os.listdir(mf_path)
            shared_params = [f for f in files if f.endswith("_shared_parameters.json")]
            files = [f for f in files if f.endswith("_results.json")]
            for f in files:
                filepath = os.path.join(mf_path, f)
                with open(filepath, "r") as inf:
                    results.append(json.load(inf))
            for sp in shared_params:
                filepath = os.path.join(mf_path, sp)
                with open(filepath, "r") as spf:
                    all_shared_params.append(np.array(json.load(spf), dtype=np.int32))
        # Plot the shared-parameter percentiles (figure 1 accumulates one curve per folder)
        plt.figure(1)
        # Average of the shared parameters
        mean = np.mean(all_shared_params, axis=0)
        std = np.std(all_shared_params, axis=0)
        with open(
            os.path.join(path, "shared_params_avg_" + folder + ".json"), "w"
        ) as mf:
            json.dump(mean.tolist(), mf)
        with open(
            os.path.join(path, "shared_params_std_" + folder + ".json"), "w"
        ) as sf:
            json.dump(std.tolist(), sf)
        # copy jupyter notebook code
        percentile = np.percentile(mean, np.arange(0, 100, 1))
        plt.plot(np.arange(0, 100, 1), percentile, label=folder)
        plt.title("Shared parameters Percentiles")
        # plt.ylabel("Absolute frequency value")
        plt.xlabel("Percentiles")
        plt.xticks(np.arange(0, 110, 10))
        plt.legend(loc="lower right")
        plt.figure(2)
        sort = torch.sort(torch.tensor(mean)).values
        print(sort)
        length = sort.shape[0]
        length = int(length / 20)
        bins = [
            torch.sum(sort[length * i : length * (i + 1)]).item() for i in range(20)
        ]
        total = np.sum(bins)
        perc = bins / total  # np.divide(bins, total)
        print(perc)
        plt.bar(np.arange(0, 97.5, 5), perc, width=5, align="edge", label=folder)
        plt.title("Shared parameters Percentiles")
        # plt.ylabel("Absolute frequency value")
        plt.xlabel("Percentiles")
        plt.legend(loc="lower right")
        plt.savefig(os.path.join(path, f"percentiles_histogram_{folder}.png"), dpi=300)
        plt.clf()
        plt.cla()
    plt.figure(1)
    plt.savefig(os.path.join(path, "percentiles.png"), dpi=300)


if __name__ == "__main__":
    assert len(sys.argv) == 2
    plot_results(sys.argv[1])
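For reference, a minimal sketch of how the percentile curve above is computed from the per-node `*_shared_parameters.json` counters; the arrays below are invented and only illustrate the `np.percentile` call in `plot_results`.

import numpy as np

# Hypothetical per-node counters of how often each parameter index was shared.
node_a = np.array([0, 3, 5, 1], dtype=np.int32)
node_b = np.array([2, 2, 4, 0], dtype=np.int32)

# Average the counters across nodes, then take the 0th-99th percentiles,
# mirroring np.percentile(mean, np.arange(0, 100, 1)) above.
mean = np.mean([node_a, node_b], axis=0)
percentiles = np.percentile(mean, np.arange(0, 100, 1))
print(percentiles[:5])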
import json
import os
import sys
from pathlib import Path

import numpy as np
from matplotlib import pyplot as plt


def plot(x, y, label, *args):
    plt.plot(x, y, *args, label=label)
    plt.legend()


def plot_shared(path, title):
    model_path = os.path.join(path, "plots")
    Path(model_path).mkdir(parents=True, exist_ok=True)
    files = [f for f in os.listdir(path) if f.endswith("json")]
    assert len(files) > 0
    for i, file in enumerate(files):
        filepath = os.path.join(path, file)
        with open(filepath, "r") as inf:
            model_vec = json.load(inf)
            del model_vec["order"]
            if i == 0:
                total_params = 0
                for l in model_vec["shapes"].values():
                    current_params = 1
                    for v in l:
                        current_params *= v
                    total_params += current_params
                print("Total Params: ", str(total_params))
                shared_count = np.zeros(total_params, dtype=int)
            del model_vec["shapes"]
            model_vec = np.array(model_vec[list(model_vec.keys())[0]])
            shared_count[model_vec] += 1
    print("sum: ", np.sum(shared_count))
    num_elements = shared_count.shape[0]
    x_axis = np.arange(1, num_elements + 1)
    plt.clf()
    plt.title(title)
    plot(x_axis, shared_count, "unsorted", ".")
    shared_count = np.sort(shared_count)
    plot(x_axis, shared_count, "sorted")
    plt.savefig(os.path.join(model_path, "shared_plot.png"))


if __name__ == "__main__":
    assert len(sys.argv) == 2
    plot_shared(sys.argv[1], "Shared Parameters")
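A hedged example of the JSON layout `plot_shared` appears to expect, inferred from the keys it deletes: an `order` list, a `shapes` dict, and one remaining key holding the shared parameter indices. The field contents below are invented.

import numpy as np

# Invented shared-parameters file in the shape plot_shared processes:
example = {
    "order": ["conv1.weight", "fc.weight"],
    "shapes": {"conv1.weight": [8, 3, 3, 3], "fc.weight": [10, 128]},
    "0": [1, 5, 42, 128],  # indices of shared parameters (key name assumed)
}
total_params = sum(int(np.prod(s)) for s in example["shapes"].values())
shared_count = np.zeros(total_params, dtype=int)
shared_count[np.array(example["0"])] += 1
print(total_params, shared_count.sum())  # 1496 4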
import distutils
import json
import os
import sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt


def plot(x_axis, means, stdevs, pos, nb_plots, title, label, loc, xlabel):
    cmap = plt.get_cmap("gist_rainbow")
    plt.title(title)
    plt.xlabel(xlabel)
    y_axis = list(means)
    err = list(stdevs)
    print("label:", label)
    print("color: ", cmap(1 / nb_plots * pos))
    plt.errorbar(
        list(x_axis), y_axis, yerr=err, label=label, color=cmap(1 / nb_plots * pos)
    )
    plt.legend(loc=loc)


def plot_results(path, epochs, global_epochs="True"):
    if global_epochs.lower() in ["true", "1", "t", "y", "yes"]:
        global_epochs = True
    else:
        global_epochs = False
    epochs = int(epochs)
    # rounds = int(rounds)
    folders = os.listdir(path)
    folders.sort()
    print("Reading folders from: ", path)
    print("Folders: ", folders)
    bytes_means, bytes_stdevs = {}, {}
    meta_means, meta_stdevs = {}, {}
    data_means, data_stdevs = {}, {}
    files = os.listdir(path)
    files = [f for f in files if f.endswith(".csv")]
    train_loss = sorted([f for f in files if f.startswith("train_loss")])
    test_acc = sorted([f for f in files if f.startswith("test_acc")])
    test_loss = sorted([f for f in files if f.startswith("test_loss")])
    min_losses = []
    for i, f in enumerate(train_loss):
        filepath = os.path.join(path, f)
        with open(filepath, "r") as inf:
            results_csv = pd.read_csv(inf)
        # Plot Training loss
        plt.figure(1)
        if global_epochs:
            rounds = results_csv["rounds"].iloc[0]
            print("Rounds: ", rounds)
            results_cr = results_csv[results_csv.rounds <= epochs * rounds]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = (
                results_cr["rounds"].to_numpy() / rounds
            )  # list(np.arange(0, len(means), 1))
            x_label = "global epochs"
        else:
            results_cr = results_csv[results_csv.rounds <= epochs]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = results_cr["rounds"].to_numpy()
            x_label = "communication rounds"
        min_losses.append(np.min(means))
        plot(
            x_axis,
            means,
            stdevs,
            i,
            len(train_loss),
            "Training Loss",
            f[len("train_loss") + 1 : -len(":2022-03-24T17:54.csv")],
            "upper right",
            x_label,
        )
    min_tlosses = []
    for i, f in enumerate(test_loss):
        filepath = os.path.join(path, f)
        with open(filepath, "r") as inf:
            results_csv = pd.read_csv(inf)
        if global_epochs:
            rounds = results_csv["rounds"].iloc[0]
            print("Rounds: ", rounds)
            results_cr = results_csv[results_csv.rounds <= epochs * rounds]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = (
                results_cr["rounds"].to_numpy() / rounds
            )  # list(np.arange(0, len(means), 1))
            x_label = "global epochs"
        else:
            results_cr = results_csv[results_csv.rounds <= epochs]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = results_cr["rounds"].to_numpy()
            x_label = "communication rounds"
        print("x axis:", x_axis)
        min_tlosses.append(np.min(means))
        # Plot Testing loss
        plt.figure(2)
        plot(
            x_axis,
            means,
            stdevs,
            i,
            len(test_loss),
            "Testing Loss",
            f[len("test_loss") + 1 : -len(":2022-03-24T17:54.csv")],
            "upper right",
            x_label,
        )
    max_taccs = []
    for i, f in enumerate(test_acc):
        filepath = os.path.join(path, f)
        with open(filepath, "r") as inf:
            results_csv = pd.read_csv(inf)
        if global_epochs:
            rounds = results_csv["rounds"].iloc[0]
            print("Rounds: ", rounds)
            results_cr = results_csv[results_csv.rounds <= epochs * rounds]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = (
                results_cr["rounds"].to_numpy() / rounds
            )  # list(np.arange(0, len(means), 1))
            x_label = "global epochs"
        else:
            results_cr = results_csv[results_csv.rounds <= epochs]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = results_cr["rounds"].to_numpy()
            x_label = "communication rounds"
        max_taccs.append(np.max(means))
        # Plot Testing Accuracy
        plt.figure(3)
        plot(
            x_axis,
            means,
            stdevs,
            i,
            len(test_acc),
            "Testing Accuracy",
            f[len("test_acc") + 1 : -len(":2022-03-24T17:54.csv")],
            "lower right",
            x_label,
        )
    names_loss = [
        f[len("train_loss") + 1 : -len(":2022-03-24T17:54.csv")] for f in train_loss
    ]
    names_acc = [
        f[len("test_acc") + 1 : -len(":2022-03-24T17:54.csv")] for f in test_acc
    ]
    print(names_loss)
    print(names_acc)
    pf = pd.DataFrame(
        {
            "test_accuracy": max_taccs,
            "test_losses": min_tlosses,
            "train_losses": min_losses,
        },
        names_loss,
    )
    pf = pf.sort_values(["test_accuracy"], axis=0, ascending=False)
    pf.to_csv(os.path.join(path, "best_results.csv"))
    plt.figure(1)
    plt.savefig(os.path.join(path, "ge_train_loss.png"), dpi=300)
    plt.figure(2)
    plt.savefig(os.path.join(path, "ge_test_loss.png"), dpi=300)
    plt.figure(3)
    plt.savefig(os.path.join(path, "ge_test_acc.png"), dpi=300)


if __name__ == "__main__":
    assert len(sys.argv) == 4
    # The args are:
    # 1: the folder with the csv files,
    # 2: the number of epochs / comm rounds to plot for,
    # 3: True/False with True meaning plot global epochs and False plot communication rounds
    print(sys.argv[1], sys.argv[2], sys.argv[3])
    plot_results(sys.argv[1], sys.argv[2], sys.argv[3])
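The three loops above assume each `train_loss*`, `test_loss*` and `test_acc*` CSV provides `rounds`, `mean` and `std` columns; a synthetic frame in that shape for reference (all values invented).

import pandas as pd

# Stand-in for one "train_loss_<config>:<timestamp>.csv"; the columns mirror
# what plot_results reads, the values are made up.
results_csv = pd.DataFrame(
    {
        "rounds": [65, 130, 195],    # cumulative communication rounds
        "mean": [2.31, 1.87, 1.52],  # metric averaged over all nodes
        "std": [0.12, 0.10, 0.09],   # standard deviation across nodes
    }
)
# With global epochs enabled, the x axis becomes rounds / rounds.iloc[0]:
print(results_csv["rounds"].to_numpy() / results_csv["rounds"].iloc[0])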
#!/bin/bash
script_path=$(realpath $(dirname $0))
# Working directory, where config files are read from and logs are written.
decpy_path=/mnt/nfs/$(whoami)/decpy_workingdir
cd $decpy_path
# Python interpreter
env_python=python3
# File regular_16.txt is available in /tutorial
graph=$decpy_path/regular_16.txt
# File config_celeba_sharing.ini is available in /tutorial
# In this config file, change addresses_filepath to correspond to your list of machines (example in /tutorial/ip.json)
original_config=$decpy_path/config_celeba_sharing.ini
# Local config file
config_file=/tmp/$(basename $original_config)
# Python script to be executed
eval_file=$script_path/testingPeerSampler.py
# General parameters
procs_per_machine=8
machines=2
iterations=5
test_after=2
log_level=INFO
m=`cat $(grep addresses_filepath $original_config | awk '{print $3}') | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
echo M is $m
log_dir=$(date '+%Y-%m-%dT%H:%M')/machine$m
mkdir -p $log_dir
# Copy and manipulate the local config file
cp $original_config $config_file
# echo "alpha = 0.10" >> $config_file
$env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level -wsd $log_dir
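The `m=` pipeline above looks up this machine's id by matching its IP in the addresses file referenced by `addresses_filepath`; a rough Python equivalent, assuming the file maps machine ids to IP strings (the exact JSON layout is an assumption based on the `cut -d'"' -f2` step).

import json

def machine_id(addresses_filepath, my_ip):
    # Assumed format: {"0": "10.90.41.127", "1": "10.90.41.128", ...}
    with open(addresses_filepath) as f:
        addresses = json.load(f)
    for mid, ip in addresses.items():
        if ip == my_ip:
            return mid
    return None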
#!/bin/bash
nfs_home=$1
python_bin=$2
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=96_regular.edges #4_node_fullyConnected.edges
config_file=~/tmp/config.ini
procs_per_machine=16
machines=6
iterations=5
train_evaluate_after=5
test_after=21 # we do not test
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
tests=("step_configs/config_celeba_partialmodel.ini" "step_configs/config_celeba_sharing.ini" "step_configs/config_celeba_fft.ini" "step_configs/config_celeba_wavelet.ini"
"step_configs/config_celeba_grow.ini" "step_configs/config_celeba_manualadapt.ini" "step_configs/config_celeba_randomalpha.ini"
"step_configs/config_celeba_randomalphainc.ini" "step_configs/config_celeba_roundrobin.ini" "step_configs/config_celeba_subsampling.ini"
"step_configs/config_celeba_topkrandom.ini" "step_configs/config_celeba_topkacc.ini" "step_configs/config_celeba_topkparam.ini")
for i in "${tests[@]}"
do
  echo $i
  IFS='_' read -ra NAMES <<< $i
  IFS='.' read -ra NAME <<< ${NAMES[-1]}
  log_dir=$nfs_home/logs/testing/${NAME[0]}$(date '+%Y-%m-%dT%H:%M')/machine$m
  mkdir -p $log_dir
  cp $i $config_file
  $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
  $env_python $eval_file -ro 0 -tea $train_evaluate_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
  echo $i is done
  sleep 3
  echo end of sleep
done
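The two `IFS` reads in the loop above only extract the experiment name from the config file name (last underscore-separated token, extension stripped); a Python one-liner showing the same transformation on one of the configs listed above.

# "step_configs/config_celeba_topkacc.ini" -> experiment name "topkacc"
config = "step_configs/config_celeba_topkacc.ini"
name = config.rsplit("_", 1)[-1].split(".", 1)[0]
print(name)  # topkacc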
#!/bin/bash
# Documentation
# Note: documentation was not written for this run file, so actual behaviour may differ
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use Adam.
#   For SGD these need to be changed manually.
# - The script will set '--test_after' and '--train_evaluate_after' to comm_rounds_per_global_epoch, i.e., the evaluation
#   on the train set and on the test set is carried out every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
#   relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute there is a chance that two folders are created, as not all
#   machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=6_star.edges
config_file=~/tmp/config.ini
procs_per_machine=6
machines=1
global_epochs=20
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
# Base configs for which the grid search is done
tests=("step_configs/config_celeba_synchronous.ini")
# Learning rates
lr="0.001"
# Batch size
batchsize="8"
# The number of communication rounds per global epoch
comm_rounds_per_global_epoch="2000"
# testing every x communication rounds
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
# Celeba has 63741 samples
# Reddit has 70642
# Femnist 734463
# Shakespeares 3678451
dataset_size=63741
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
# random_seeds for which to rerun the experiments
random_seeds=("90" "91" "92" "93" "94")
# random_seed = 97
echo batchsize: $batchsize
echo communication rounds per global epoch: $comm_rounds_per_global_epoch
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $batchsize))
echo batches per global epoch: $batches_per_epoch
# the number of iterations in $global_epochs global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $comm_rounds_per_global_epoch >= $batches_per_epoch else print($global_epochs * $comm_rounds_per_global_epoch)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $comm_rounds_per_global_epoch); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $comm_rounds_per_global_epoch); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $comm_rounds_per_global_epoch)/x)*$iterations); print($iterations if y<$iterations else y)")
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
for i in "${tests[@]}"
do
  for seed in "${random_seeds[@]}"
  do
    echo $i
    IFS='_' read -ra NAMES <<< $i
    IFS='.' read -ra NAME <<< ${NAMES[-1]}
    log_dir=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$comm_rounds_per_global_epoch:b=$batchsize:$(date '+%Y-%m-%dT%H:%M')/machine$m
    echo results are stored in: $log_dir
    mkdir -p $log_dir
    cp $i $config_file
    # changing the config files to reflect the values of the current grid search state
    $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
    $python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
    $python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
    $python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $batchsize
    $python_bin/crudini --set $config_file DATASET random_seed $seed
    $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
    echo $i is done
    sleep 200
    echo end of sleep
  done
done
#
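The embedded Python one-liners above derive the iteration budget from the dataset size; a worked version of that arithmetic using this script's own defaults (6 processes, dataset_size 63741, batch size 8, 2000 communication rounds per global epoch, 20 global epochs), written out for clarity.

from math import floor

procs = 6 * 1                               # procs_per_machine * machines
samples_per_user = 63741 // procs           # 10623
batches_per_epoch = samples_per_user // 8   # 1327
comm_rounds = 2000                          # comm_rounds_per_global_epoch
global_epochs = 20

# More comm rounds than batches per epoch, so iterations = batches * epochs.
iterations = (
    batches_per_epoch * global_epochs
    if comm_rounds >= batches_per_epoch
    else global_epochs * comm_rounds
)                                           # 26540

# At least one local batch per communication round.
batches_per_comm_round = max(1, floor(batches_per_epoch / comm_rounds))  # 1

# Re-derive the iteration count after rounding; the script keeps the larger value.
y = floor((batches_per_epoch / comm_rounds) / batches_per_comm_round * iterations)
new_iterations = iterations if y < iterations else y                     # 26540
test_after = new_iterations // global_epochs                             # 1327
print(iterations, batches_per_comm_round, new_iterations, test_after)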
#!/bin/bash
# Documentation
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use SGD.
# - The script will set '--test_after' and '--train_evaluate_after' such that evaluation happens at the end of every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
#   relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute there is a chance that two folders are created, as not all
#   machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=96_regular.edges
config_file=~/tmp/config.ini
procs_per_machine=16
machines=6
global_epochs=25
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
# Base configs for which the grid search is done
tests=("step_configs/config_celeba_sharing.ini")
# Learning rates to test
lrs=( "0.001" "0.0001" "0.0001")
# Batch sizes to test
batchsize=("8" "16")
# The number of communication rounds per global epoch to test
comm_rounds_per_global_epoch=("1" "5" "10")
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
dataset_size=63741
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
for b in "${batchsize[@]}"
do
  echo batchsize: $b
  for r in "${comm_rounds_per_global_epoch[@]}"
  do
    echo communication rounds per global epoch: $r
    # calculating how many batches there are in a global epoch for each user/proc
    batches_per_epoch=$(($samples_per_user / $b))
    echo batches per global epoch: $batches_per_epoch
    # the number of iterations in $global_epochs global epochs
    iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $r >= $batches_per_epoch else print($global_epochs * $r)")
    echo iterations: $iterations
    # calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
    batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $r); print(1 if x==0 else x)")
    # since the batches per communication round were rounded down we need to change the number of iterations to reflect that
    new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $r); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $r)/x)*$iterations); print($iterations if y<$iterations else y)")
    echo batches per communication round: $batches_per_comm_round
    echo corrected iterations: $new_iterations
    test_after=$(($new_iterations / $global_epochs))
    echo test after: $test_after
    for lr in "${lrs[@]}"
    do
      for i in "${tests[@]}"
      do
        echo $i
        IFS='_' read -ra NAMES <<< $i
        IFS='.' read -ra NAME <<< ${NAMES[-1]}
        log_dir=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$r:b=$b:$(date '+%Y-%m-%dT%H:%M')/machine$m
        echo results are stored in: $log_dir
        mkdir -p $log_dir
        cp $i $config_file
        # changing the config files to reflect the values of the current grid search state
        $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
        $python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
        $python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
        $python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $b
        $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
        echo $i is done
        sleep 1
        echo end of sleep
      done
    done
  done
done
#
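The `crudini --set` calls above rewrite individual keys of the copied INI before each run; a rough `configparser` equivalent for readers without crudini. Crudini edits the file in place, whereas configparser re-serialises it, so this is only an illustration; the output path is a placeholder.

import configparser

# Rough stand-in for: crudini --set $config_file OPTIMIZER_PARAMS lr $lr
parser = configparser.ConfigParser()
parser.read_string("[OPTIMIZER_PARAMS]\noptimizer_class = SGD\nlr = 0.001\n")
parser.set("OPTIMIZER_PARAMS", "lr", "0.01")
with open("/tmp/config.ini", "w") as f:  # placeholder path
    parser.write(f)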
#!/bin/bash
# Documentation
# Note: documentation was not written for this run file, so actual behaviour may differ
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use Adam.
#   For SGD these need to be changed manually.
# - The script will set '--test_after' and '--train_evaluate_after' to comm_rounds_per_global_epoch, i.e., the evaluation
#   on the train set and on the test set is carried out every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
#   relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute there is a chance that two folders are created, as not all
#   machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
config_file=~/tmp/config.ini
procs_per_machine=16
machines=6
global_epochs=150
eval_file=testingPeerSampler.py
log_level=INFO
ip_machines=$nfs_home/$logs_subfolder/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
# Base configs for which the grid search is done
tests="$nfs_home/$logs_subfolder/config.ini"
#tests=("$nfs_home/$logs_subfolder/config_cifar_sharing.ini" "$nfs_home/$logs_subfolder/config_cifar_partialmodel.ini" "$nfs_home/$logs_subfolder/config_cifar_topkacc.ini" "$nfs_home/$logs_subfolder/config_cifar_topkaccRandomAlpha.ini" "$nfs_home/$logs_subfolder/config_cifar_subsampling.ini" "$nfs_home/$logs_subfolder/config_cifar_wavelet.ini" "$nfs_home/$logs_subfolder/config_cifar_waveletRandomAlpha.ini")
#tests=("$nfs_home/$logs_subfolder/config_cifar_partialmodel.ini" "$nfs_home/$logs_subfolder/config_cifar_topkacc.ini" "$nfs_home/$logs_subfolder/config_cifar_topkaccRandomAlpha.ini" "$nfs_home/$logs_subfolder/config_cifar_subsampling.ini" "$nfs_home/$logs_subfolder/config_cifar_wavelet.ini" "$nfs_home/$logs_subfolder/config_cifar_waveletRandomAlpha.ini")
#tests=("$nfs_home/$logs_subfolder/config_cifar_subsampling.ini" "$nfs_home/$logs_subfolder/config_cifar_sharing.ini" "$nfs_home/$logs_subfolder/config_cifar_waveletRandomAlpha.ini")
#tests=("$nfs_home/$logs_subfolder/config_cifar_waveletRandomAlpha.ini")
# Learning rates
lr="0.01"
# Batch size
batchsize="8"
# The number of communication rounds per global epoch
comm_rounds_per_global_epoch="20"
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
# Celeba has 63741 samples
# Reddit has 70642
# Femnist 734463
# Shakespeares 3678451
dataset_size=50000
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
# random_seeds for which to rerun the experiments
# random_seeds=("90" "91" "92" "93" "94")
random_seeds=("94")
# random_seed = 97
echo batchsize: $batchsize
echo communication rounds per global epoch: $comm_rounds_per_global_epoch
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $batchsize))
echo batches per global epoch: $batches_per_epoch
# the number of iterations in $global_epochs global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $comm_rounds_per_global_epoch >= $batches_per_epoch else print($global_epochs * $comm_rounds_per_global_epoch)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $comm_rounds_per_global_epoch); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $comm_rounds_per_global_epoch); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $comm_rounds_per_global_epoch)/x)*$iterations); print($iterations if y<$iterations else y)")
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
for i in "${tests[@]}"
do
  for seed in "${random_seeds[@]}"
  do
    echo $i
    IFS='_' read -ra NAMES <<< $i
    IFS='.' read -ra NAME <<< ${NAMES[-1]}
    #log_dir_base=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$comm_rounds_per_global_epoch:b=$batchsize:$(date '+%Y-%m-%dT%H:%M')
    log_dir_base=$nfs_home/$logs_subfolder/lr=$lr:r=$comm_rounds_per_global_epoch:b=$batchsize:$(date '+%Y-%m-%dT%H:%M')
    echo results are stored in: $log_dir_base
    log_dir=$log_dir_base/machine$m
    mkdir -p $log_dir
    weight_store_dir=$log_dir_base/weights
    mkdir -p $weight_store_dir
    graph=$nfs_home/decentralizepy/eval/96_regular.edges
    cp $i $config_file
    # changing the config files to reflect the values of the current grid search state
    $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
    $python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
    $python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
    $python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $batchsize
    $python_bin/crudini --set $config_file DATASET random_seed $seed
    $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
    $python_bin/crudini --set $config_file COMMUNICATION offset 10720
    # $env_python $eval_file -cte 0 -ro 0 -tea $test_after -ld $log_dir -wsd $weight_store_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
    $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
    echo $i is done
    sleep 200
    echo end of sleep
  done
done
#
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.FFT
sharing_class = FFT
alpha = 0.1
change_based_selection = True
accumulation = True
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.GrowingAlpha
sharing_class = GrowingAlpha
init_alpha = 0.10
max_alpha = 0.75
k = 6
metadata_cap = 0.65
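The `*_package` / `*_class` pairs in these configs suggest that classes are resolved by dynamic import; a minimal sketch of that pattern. The helper below is hypothetical, not decentralizepy's actual loader, and it is demonstrated on a stdlib class so it runs without the framework installed.

import importlib

def load_class(package, class_name):
    # Hypothetical helper: resolve e.g. ("decentralizepy.sharing.FFT", "FFT").
    return getattr(importlib.import_module(package), class_name)

print(load_class("collections", "OrderedDict"))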