import json
import os
import sys

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt


def get_stats(l):
    assert len(l) > 0
    mean_dict, stdev_dict, min_dict, max_dict = {}, {}, {}, {}
    for key in l[0].keys():
        all_nodes = [i[key] for i in l]
        all_nodes = np.array(all_nodes)
        mean = np.mean(all_nodes)
        std = np.std(all_nodes)
        min = np.min(all_nodes)
        max = np.max(all_nodes)
        mean_dict[int(key)] = mean
        stdev_dict[int(key)] = std
        min_dict[int(key)] = min
        max_dict[int(key)] = max
    return mean_dict, stdev_dict, min_dict, max_dict


def plot(means, stdevs, mins, maxs, title, label, loc):
    plt.title(title)
    plt.xlabel("communication rounds")
    x_axis = list(means.keys())
    y_axis = list(means.values())
    err = list(stdevs.values())
    plt.errorbar(x_axis, y_axis, yerr=err, label=label)
    plt.legend(loc=loc)


def plot_results(path):
    """
    Plots the percentiles of the shared parameters.
    Based on plot.py

    Parameters
    ----------
    path
        Path to the folders from which to create the percentiles plots.

    """
    folders = os.listdir(path)
    folders.sort()
    print("Reading folders from: ", path)
    print("Folders: ", folders)
    for folder in folders:
        folder_path = os.path.join(path, folder)
        if not os.path.isdir(folder_path):
            continue
        results = []
        all_shared_params = []
        machine_folders = os.listdir(folder_path)
        for machine_folder in machine_folders:
            mf_path = os.path.join(folder_path, machine_folder)
            if not os.path.isdir(mf_path):
                continue
            files = os.listdir(mf_path)
            shared_params = [f for f in files if f.endswith("_shared_parameters.json")]
            files = [f for f in files if f.endswith("_results.json")]
            for f in files:
                filepath = os.path.join(mf_path, f)
                with open(filepath, "r") as inf:
                    results.append(json.load(inf))
            for sp in shared_params:
                filepath = os.path.join(mf_path, sp)
                with open(filepath, "r") as spf:
                    all_shared_params.append(np.array(json.load(spf), dtype=np.int32))
        # Plot the shared-parameter percentiles (figure 1 accumulates one curve per folder)
        plt.figure(1)
        # Average of the shared parameters
        mean = np.mean(all_shared_params, axis=0)
        std = np.std(all_shared_params, axis=0)
        with open(
            os.path.join(path, "shared_params_avg_" + folder + ".json"), "w"
        ) as mf:
            json.dump(mean.tolist(), mf)
        with open(
            os.path.join(path, "shared_params_std_" + folder + ".json"), "w"
        ) as sf:
            json.dump(std.tolist(), sf)
        # copy jupyter notebook code
        percentile = np.percentile(mean, np.arange(0, 100, 1))
        plt.plot(np.arange(0, 100, 1), percentile, label=folder)
        plt.title("Shared parameters Percentiles")
        # plt.ylabel("Absolute frequency value")
        plt.xlabel("Percentiles")
        plt.xticks(np.arange(0, 110, 10))
        plt.legend(loc="lower right")
        plt.figure(2)
        sort = torch.sort(torch.tensor(mean)).values
        print(sort)
        length = sort.shape[0]
        length = int(length / 20)
        bins = [
            torch.sum(sort[length * i : length * (i + 1)]).item() for i in range(20)
        ]
        total = np.sum(bins)
        perc = bins / total  # np.divide(bins, total)
        print(perc)
        plt.bar(np.arange(0, 97.5, 5), perc, width=5, align="edge", label=folder)
        plt.title("Shared parameters Percentiles")
        # plt.ylabel("Absolute frequency value")
        plt.xlabel("Percentiles")
        plt.legend(loc="lower right")
        plt.savefig(os.path.join(path, f"percentiles_histogram_{folder}.png"), dpi=300)
        plt.clf()
        plt.cla()
    plt.figure(1)
    plt.savefig(os.path.join(path, "percentiles.png"), dpi=300)


if __name__ == "__main__":
    assert len(sys.argv) == 2
    plot_results(sys.argv[1])
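For reference, a minimal sketch of how the percentile curve above is computed from the per-node `*_shared_parameters.json` counters; the arrays below are invented and only illustrate the `np.percentile` call in `plot_results`.

import numpy as np

# Hypothetical per-node counters of how often each parameter index was shared.
node_a = np.array([0, 3, 5, 1], dtype=np.int32)
node_b = np.array([2, 2, 4, 0], dtype=np.int32)

# Average the counters across nodes, then take the 0th-99th percentiles,
# mirroring np.percentile(mean, np.arange(0, 100, 1)) above.
mean = np.mean([node_a, node_b], axis=0)
percentiles = np.percentile(mean, np.arange(0, 100, 1))
print(percentiles[:5])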
import json
import os
import sys
from pathlib import Path

import numpy as np
from matplotlib import pyplot as plt


def plot(x, y, label, *args):
    plt.plot(x, y, *args, label=label)
    plt.legend()


def plot_shared(path, title):
    model_path = os.path.join(path, "plots")
    Path(model_path).mkdir(parents=True, exist_ok=True)
    files = [f for f in os.listdir(path) if f.endswith("json")]
    assert len(files) > 0
    for i, file in enumerate(files):
        filepath = os.path.join(path, file)
        with open(filepath, "r") as inf:
            model_vec = json.load(inf)
            del model_vec["order"]
            if i == 0:
                total_params = 0
                for l in model_vec["shapes"].values():
                    current_params = 1
                    for v in l:
                        current_params *= v
                    total_params += current_params
                print("Total Params: ", str(total_params))
                shared_count = np.zeros(total_params, dtype=int)
            del model_vec["shapes"]
            model_vec = np.array(model_vec[list(model_vec.keys())[0]])
            shared_count[model_vec] += 1
    print("sum: ", np.sum(shared_count))
    num_elements = shared_count.shape[0]
    x_axis = np.arange(1, num_elements + 1)
    plt.clf()
    plt.title(title)
    plot(x_axis, shared_count, "unsorted", ".")
    shared_count = np.sort(shared_count)
    plot(x_axis, shared_count, "sorted")
    plt.savefig(os.path.join(model_path, "shared_plot.png"))


if __name__ == "__main__":
    assert len(sys.argv) == 2
    plot_shared(sys.argv[1], "Shared Parameters")
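A hedged example of the JSON layout `plot_shared` appears to expect, inferred from the keys it deletes: an `order` list, a `shapes` dict, and one remaining key holding the shared parameter indices. The field contents below are invented.

import numpy as np

# Invented shared-parameters file in the shape plot_shared processes:
example = {
    "order": ["conv1.weight", "fc.weight"],
    "shapes": {"conv1.weight": [8, 3, 3, 3], "fc.weight": [10, 128]},
    "0": [1, 5, 42, 128],  # indices of shared parameters (key name assumed)
}
total_params = sum(int(np.prod(s)) for s in example["shapes"].values())
shared_count = np.zeros(total_params, dtype=int)
shared_count[np.array(example["0"])] += 1
print(total_params, shared_count.sum())  # 1496 4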
import distutils
import json
import os
import sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt


def plot(x_axis, means, stdevs, pos, nb_plots, title, label, loc, xlabel):
    cmap = plt.get_cmap("gist_rainbow")
    plt.title(title)
    plt.xlabel(xlabel)
    y_axis = list(means)
    err = list(stdevs)
    print("label:", label)
    print("color: ", cmap(1 / nb_plots * pos))
    plt.errorbar(
        list(x_axis), y_axis, yerr=err, label=label, color=cmap(1 / nb_plots * pos)
    )
    plt.legend(loc=loc)


def plot_results(path, epochs, global_epochs="True"):
    if global_epochs.lower() in ["true", "1", "t", "y", "yes"]:
        global_epochs = True
    else:
        global_epochs = False
    epochs = int(epochs)
    # rounds = int(rounds)
    folders = os.listdir(path)
    folders.sort()
    print("Reading folders from: ", path)
    print("Folders: ", folders)
    bytes_means, bytes_stdevs = {}, {}
    meta_means, meta_stdevs = {}, {}
    data_means, data_stdevs = {}, {}
    files = os.listdir(path)
    files = [f for f in files if f.endswith(".csv")]
    train_loss = sorted([f for f in files if f.startswith("train_loss")])
    test_acc = sorted([f for f in files if f.startswith("test_acc")])
    test_loss = sorted([f for f in files if f.startswith("test_loss")])
    min_losses = []
    for i, f in enumerate(train_loss):
        filepath = os.path.join(path, f)
        with open(filepath, "r") as inf:
            results_csv = pd.read_csv(inf)
        # Plot Training loss
        plt.figure(1)
        if global_epochs:
            rounds = results_csv["rounds"].iloc[0]
            print("Rounds: ", rounds)
            results_cr = results_csv[results_csv.rounds <= epochs * rounds]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = (
                results_cr["rounds"].to_numpy() / rounds
            )  # list(np.arange(0, len(means), 1))
            x_label = "global epochs"
        else:
            results_cr = results_csv[results_csv.rounds <= epochs]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = results_cr["rounds"].to_numpy()
            x_label = "communication rounds"
        min_losses.append(np.min(means))
        plot(
            x_axis,
            means,
            stdevs,
            i,
            len(train_loss),
            "Training Loss",
            f[len("train_loss") + 1 : -len(":2022-03-24T17:54.csv")],
            "upper right",
            x_label,
        )
    min_tlosses = []
    for i, f in enumerate(test_loss):
        filepath = os.path.join(path, f)
        with open(filepath, "r") as inf:
            results_csv = pd.read_csv(inf)
        if global_epochs:
            rounds = results_csv["rounds"].iloc[0]
            print("Rounds: ", rounds)
            results_cr = results_csv[results_csv.rounds <= epochs * rounds]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = (
                results_cr["rounds"].to_numpy() / rounds
            )  # list(np.arange(0, len(means), 1))
            x_label = "global epochs"
        else:
            results_cr = results_csv[results_csv.rounds <= epochs]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = results_cr["rounds"].to_numpy()
            x_label = "communication rounds"
        print("x axis:", x_axis)
        min_tlosses.append(np.min(means))
        # Plot Testing loss
        plt.figure(2)
        plot(
            x_axis,
            means,
            stdevs,
            i,
            len(test_loss),
            "Testing Loss",
            f[len("test_loss") + 1 : -len(":2022-03-24T17:54.csv")],
            "upper right",
            x_label,
        )
    max_taccs = []
    for i, f in enumerate(test_acc):
        filepath = os.path.join(path, f)
        with open(filepath, "r") as inf:
            results_csv = pd.read_csv(inf)
        if global_epochs:
            rounds = results_csv["rounds"].iloc[0]
            print("Rounds: ", rounds)
            results_cr = results_csv[results_csv.rounds <= epochs * rounds]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = (
                results_cr["rounds"].to_numpy() / rounds
            )  # list(np.arange(0, len(means), 1))
            x_label = "global epochs"
        else:
            results_cr = results_csv[results_csv.rounds <= epochs]
            means = results_cr["mean"].to_numpy()
            stdevs = results_cr["std"].to_numpy()
            x_axis = results_cr["rounds"].to_numpy()
            x_label = "communication rounds"
        max_taccs.append(np.max(means))
        # Plot Testing Accuracy
        plt.figure(3)
        plot(
            x_axis,
            means,
            stdevs,
            i,
            len(test_acc),
            "Testing Accuracy",
            f[len("test_acc") + 1 : -len(":2022-03-24T17:54.csv")],
            "lower right",
            x_label,
        )
    names_loss = [
        f[len("train_loss") + 1 : -len(":2022-03-24T17:54.csv")] for f in train_loss
    ]
    names_acc = [
        f[len("test_acc") + 1 : -len(":2022-03-24T17:54.csv")] for f in test_acc
    ]
    print(names_loss)
    print(names_acc)
    pf = pd.DataFrame(
        {
            "test_accuracy": max_taccs,
            "test_losses": min_tlosses,
            "train_losses": min_losses,
        },
        names_loss,
    )
    pf = pf.sort_values(["test_accuracy"], axis=0, ascending=False)
    pf.to_csv(os.path.join(path, "best_results.csv"))
    plt.figure(1)
    plt.savefig(os.path.join(path, "ge_train_loss.png"), dpi=300)
    plt.figure(2)
    plt.savefig(os.path.join(path, "ge_test_loss.png"), dpi=300)
    plt.figure(3)
    plt.savefig(os.path.join(path, "ge_test_acc.png"), dpi=300)


if __name__ == "__main__":
    assert len(sys.argv) == 4
    # The args are:
    # 1: the folder with the csv files,
    # 2: the number of epochs / comm rounds to plot for,
    # 3: True/False with True meaning plot global epochs and False plot communication rounds
    print(sys.argv[1], sys.argv[2], sys.argv[3])
    plot_results(sys.argv[1], sys.argv[2], sys.argv[3])
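The three loops above assume each `train_loss*`, `test_loss*` and `test_acc*` CSV provides `rounds`, `mean` and `std` columns; a synthetic frame in that shape for reference (all values invented).

import pandas as pd

# Stand-in for one "train_loss_<config>:<timestamp>.csv"; the columns mirror
# what plot_results reads, the values are made up.
results_csv = pd.DataFrame(
    {
        "rounds": [65, 130, 195],    # cumulative communication rounds
        "mean": [2.31, 1.87, 1.52],  # metric averaged over all nodes
        "std": [0.12, 0.10, 0.09],   # standard deviation across nodes
    }
)
# With global epochs enabled, the x axis becomes rounds / rounds.iloc[0]:
print(results_csv["rounds"].to_numpy() / results_csv["rounds"].iloc[0])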
#!/bin/bash
script_path=$(realpath $(dirname $0))
# Working directory, where config files are read from and logs are written.
decpy_path=/mnt/nfs/$(whoami)/decpy_workingdir
cd $decpy_path
# Python interpreter
env_python=python3
# File regular_16.txt is available in /tutorial
graph=$decpy_path/regular_16.txt
# File config_celeba_sharing.ini is available in /tutorial
# In this config file, change addresses_filepath to correspond to your list of machines (example in /tutorial/ip.json)
original_config=$decpy_path/config_celeba_sharing.ini
# Local config file
config_file=/tmp/$(basename $original_config)
# Python script to be executed
eval_file=$script_path/testingPeerSampler.py
# General parameters
procs_per_machine=8
machines=2
iterations=5
test_after=2
log_level=INFO
m=`cat $(grep addresses_filepath $original_config | awk '{print $3}') | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
echo M is $m
log_dir=$(date '+%Y-%m-%dT%H:%M')/machine$m
mkdir -p $log_dir
# Copy and manipulate the local config file
cp $original_config $config_file
# echo "alpha = 0.10" >> $config_file
$env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level -wsd $log_dir
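The `m=` pipeline above looks up this machine's id by matching its IP in the addresses file referenced by `addresses_filepath`; a rough Python equivalent, assuming the file maps machine ids to IP strings (the exact JSON layout is an assumption based on the `cut -d'"' -f2` step).

import json

def machine_id(addresses_filepath, my_ip):
    # Assumed format: {"0": "10.90.41.127", "1": "10.90.41.128", ...}
    with open(addresses_filepath) as f:
        addresses = json.load(f)
    for mid, ip in addresses.items():
        if ip == my_ip:
            return mid
    return None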
#!/bin/bash
nfs_home=$1
python_bin=$2
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=96_regular.edges #4_node_fullyConnected.edges
config_file=~/tmp/config.ini
procs_per_machine=16
machines=6
iterations=5
train_evaluate_after=5
test_after=21 # we do not test
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
tests=("step_configs/config_celeba_partialmodel.ini" "step_configs/config_celeba_sharing.ini" "step_configs/config_celeba_fft.ini" "step_configs/config_celeba_wavelet.ini"
"step_configs/config_celeba_grow.ini" "step_configs/config_celeba_manualadapt.ini" "step_configs/config_celeba_randomalpha.ini"
"step_configs/config_celeba_randomalphainc.ini" "step_configs/config_celeba_roundrobin.ini" "step_configs/config_celeba_subsampling.ini"
"step_configs/config_celeba_topkrandom.ini" "step_configs/config_celeba_topkacc.ini" "step_configs/config_celeba_topkparam.ini")
for i in "${tests[@]}"
do
  echo $i
  IFS='_' read -ra NAMES <<< $i
  IFS='.' read -ra NAME <<< ${NAMES[-1]}
  log_dir=$nfs_home/logs/testing/${NAME[0]}$(date '+%Y-%m-%dT%H:%M')/machine$m
  mkdir -p $log_dir
  cp $i $config_file
  $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
  $env_python $eval_file -ro 0 -tea $train_evaluate_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
  echo $i is done
  sleep 3
  echo end of sleep
done
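The two `IFS` reads in the loop above only extract the experiment name from the config file name (last underscore-separated token, extension stripped); a Python one-liner showing the same transformation on one of the configs listed above.

# "step_configs/config_celeba_topkacc.ini" -> experiment name "topkacc"
config = "step_configs/config_celeba_topkacc.ini"
name = config.rsplit("_", 1)[-1].split(".", 1)[0]
print(name)  # topkacc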
#!/bin/bash
# Documentation
# Note: documentation was not written for this run file, so actual behaviour may differ
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use Adam.
#   For SGD these need to be changed manually.
# - The script will set '--test_after' and '--train_evaluate_after' to comm_rounds_per_global_epoch, i.e., the evaluation
#   on the train set and on the test set is carried out every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
#   relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute there is a chance that two folders are created, as not all
#   machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=6_star.edges
config_file=~/tmp/config.ini
procs_per_machine=6
machines=1
global_epochs=20
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
# Base configs for which the grid search is done
tests=("step_configs/config_celeba_synchronous.ini")
# Learning rates
lr="0.001"
# Batch size
batchsize="8"
# The number of communication rounds per global epoch
comm_rounds_per_global_epoch="2000"
# testing every x communication rounds
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
# Celeba has 63741 samples
# Reddit has 70642
# Femnist 734463
# Shakespeares 3678451
dataset_size=63741
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
# random_seeds for which to rerun the experiments
random_seeds=("90" "91" "92" "93" "94")
# random_seed = 97
echo batchsize: $batchsize
echo communication rounds per global epoch: $comm_rounds_per_global_epoch
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $batchsize))
echo batches per global epoch: $batches_per_epoch
# the number of iterations in $global_epochs global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $comm_rounds_per_global_epoch >= $batches_per_epoch else print($global_epochs * $comm_rounds_per_global_epoch)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $comm_rounds_per_global_epoch); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $comm_rounds_per_global_epoch); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $comm_rounds_per_global_epoch)/x)*$iterations); print($iterations if y<$iterations else y)")
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
for i in "${tests[@]}"
do
  for seed in "${random_seeds[@]}"
  do
    echo $i
    IFS='_' read -ra NAMES <<< $i
    IFS='.' read -ra NAME <<< ${NAMES[-1]}
    log_dir=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$comm_rounds_per_global_epoch:b=$batchsize:$(date '+%Y-%m-%dT%H:%M')/machine$m
    echo results are stored in: $log_dir
    mkdir -p $log_dir
    cp $i $config_file
    # changing the config files to reflect the values of the current grid search state
    $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
    $python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
    $python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
    $python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $batchsize
    $python_bin/crudini --set $config_file DATASET random_seed $seed
    $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
    echo $i is done
    sleep 200
    echo end of sleep
  done
done
#
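The embedded Python one-liners above derive the iteration budget from the dataset size; a worked version of that arithmetic using this script's own defaults (6 processes, dataset_size 63741, batch size 8, 2000 communication rounds per global epoch, 20 global epochs), written out for clarity.

from math import floor

procs = 6 * 1                               # procs_per_machine * machines
samples_per_user = 63741 // procs           # 10623
batches_per_epoch = samples_per_user // 8   # 1327
comm_rounds = 2000                          # comm_rounds_per_global_epoch
global_epochs = 20

# More comm rounds than batches per epoch, so iterations = batches * epochs.
iterations = (
    batches_per_epoch * global_epochs
    if comm_rounds >= batches_per_epoch
    else global_epochs * comm_rounds
)                                           # 26540

# At least one local batch per communication round.
batches_per_comm_round = max(1, floor(batches_per_epoch / comm_rounds))  # 1

# Re-derive the iteration count after rounding; the script keeps the larger value.
y = floor((batches_per_epoch / comm_rounds) / batches_per_comm_round * iterations)
new_iterations = iterations if y < iterations else y                     # 26540
test_after = new_iterations // global_epochs                             # 1327
print(iterations, batches_per_comm_round, new_iterations, test_after)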
#!/bin/bash
# Documentation
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use SGD.
# - The script will set '--test_after' and '--train_evaluate_after' such that evaluation happens at the end of every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
#   relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute there is a chance that two folders are created, as not all
#   machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=96_regular.edges
config_file=~/tmp/config.ini
procs_per_machine=16
machines=6
global_epochs=25
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
# Base configs for which the grid search is done
tests=("step_configs/config_celeba_sharing.ini")
# Learning rates to test
lrs=( "0.001" "0.0001" "0.0001")
# Batch sizes to test
batchsize=("8" "16")
# The number of communication rounds per global epoch to test
comm_rounds_per_global_epoch=("1" "5" "10")
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
dataset_size=63741
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
for b in "${batchsize[@]}"
do
  echo batchsize: $b
  for r in "${comm_rounds_per_global_epoch[@]}"
  do
    echo communication rounds per global epoch: $r
    # calculating how many batches there are in a global epoch for each user/proc
    batches_per_epoch=$(($samples_per_user / $b))
    echo batches per global epoch: $batches_per_epoch
    # the number of iterations in $global_epochs global epochs
    iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $r >= $batches_per_epoch else print($global_epochs * $r)")
    echo iterations: $iterations
    # calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
    batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $r); print(1 if x==0 else x)")
    # since the batches per communication round were rounded down we need to change the number of iterations to reflect that
    new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $r); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $r)/x)*$iterations); print($iterations if y<$iterations else y)")
    echo batches per communication round: $batches_per_comm_round
    echo corrected iterations: $new_iterations
    test_after=$(($new_iterations / $global_epochs))
    echo test after: $test_after
    for lr in "${lrs[@]}"
    do
      for i in "${tests[@]}"
      do
        echo $i
        IFS='_' read -ra NAMES <<< $i
        IFS='.' read -ra NAME <<< ${NAMES[-1]}
        log_dir=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$r:b=$b:$(date '+%Y-%m-%dT%H:%M')/machine$m
        echo results are stored in: $log_dir
        mkdir -p $log_dir
        cp $i $config_file
        # changing the config files to reflect the values of the current grid search state
        $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
        $python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
        $python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
        $python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $b
        $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
        echo $i is done
        sleep 1
        echo end of sleep
      done
    done
  done
done
#
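The `crudini --set` calls above rewrite individual keys of the copied INI before each run; a rough `configparser` equivalent for readers without crudini. Crudini edits the file in place, whereas configparser re-serialises it, so this is only an illustration; the output path is a placeholder.

import configparser

# Rough stand-in for: crudini --set $config_file OPTIMIZER_PARAMS lr $lr
parser = configparser.ConfigParser()
parser.read_string("[OPTIMIZER_PARAMS]\noptimizer_class = SGD\nlr = 0.001\n")
parser.set("OPTIMIZER_PARAMS", "lr", "0.01")
with open("/tmp/config.ini", "w") as f:  # placeholder path
    parser.write(f)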
#!/bin/bash
# Documentation
# Note: documentation was not written for this run file, so actual behaviour may differ
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use Adam.
#   For SGD these need to be changed manually.
# - The script will set '--test_after' and '--train_evaluate_after' to comm_rounds_per_global_epoch, i.e., the evaluation
#   on the train set and on the test set is carried out every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
#   relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute there is a chance that two folders are created, as not all
#   machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
config_file=~/tmp/config.ini
procs_per_machine=16
machines=6
global_epochs=150
eval_file=testingPeerSampler.py
log_level=INFO
ip_machines=$nfs_home/$logs_subfolder/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
# Base configs for which the grid search is done
tests="$nfs_home/$logs_subfolder/config.ini"
#tests=("$nfs_home/$logs_subfolder/config_cifar_sharing.ini" "$nfs_home/$logs_subfolder/config_cifar_partialmodel.ini" "$nfs_home/$logs_subfolder/config_cifar_topkacc.ini" "$nfs_home/$logs_subfolder/config_cifar_topkaccRandomAlpha.ini" "$nfs_home/$logs_subfolder/config_cifar_subsampling.ini" "$nfs_home/$logs_subfolder/config_cifar_wavelet.ini" "$nfs_home/$logs_subfolder/config_cifar_waveletRandomAlpha.ini")
#tests=("$nfs_home/$logs_subfolder/config_cifar_partialmodel.ini" "$nfs_home/$logs_subfolder/config_cifar_topkacc.ini" "$nfs_home/$logs_subfolder/config_cifar_topkaccRandomAlpha.ini" "$nfs_home/$logs_subfolder/config_cifar_subsampling.ini" "$nfs_home/$logs_subfolder/config_cifar_wavelet.ini" "$nfs_home/$logs_subfolder/config_cifar_waveletRandomAlpha.ini")
#tests=("$nfs_home/$logs_subfolder/config_cifar_subsampling.ini" "$nfs_home/$logs_subfolder/config_cifar_sharing.ini" "$nfs_home/$logs_subfolder/config_cifar_waveletRandomAlpha.ini")
#tests=("$nfs_home/$logs_subfolder/config_cifar_waveletRandomAlpha.ini")
# Learning rates
lr="0.01"
# Batch size
batchsize="8"
# The number of communication rounds per global epoch
comm_rounds_per_global_epoch="20"
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
# Celeba has 63741 samples
# Reddit has 70642
# Femnist 734463
# Shakespeares 3678451
dataset_size=50000
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
# random_seeds for which to rerun the experiments
# random_seeds=("90" "91" "92" "93" "94")
random_seeds=("94")
# random_seed = 97
echo batchsize: $batchsize
echo communication rounds per global epoch: $comm_rounds_per_global_epoch
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $batchsize))
echo batches per global epoch: $batches_per_epoch
# the number of iterations in $global_epochs global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $comm_rounds_per_global_epoch >= $batches_per_epoch else print($global_epochs * $comm_rounds_per_global_epoch)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $comm_rounds_per_global_epoch); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $comm_rounds_per_global_epoch); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $comm_rounds_per_global_epoch)/x)*$iterations); print($iterations if y<$iterations else y)")
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
for i in "${tests[@]}"
do
  for seed in "${random_seeds[@]}"
  do
    echo $i
    IFS='_' read -ra NAMES <<< $i
    IFS='.' read -ra NAME <<< ${NAMES[-1]}
    #log_dir_base=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$comm_rounds_per_global_epoch:b=$batchsize:$(date '+%Y-%m-%dT%H:%M')
    log_dir_base=$nfs_home/$logs_subfolder/lr=$lr:r=$comm_rounds_per_global_epoch:b=$batchsize:$(date '+%Y-%m-%dT%H:%M')
    echo results are stored in: $log_dir_base
    log_dir=$log_dir_base/machine$m
    mkdir -p $log_dir
    weight_store_dir=$log_dir_base/weights
    mkdir -p $weight_store_dir
    graph=$nfs_home/decentralizepy/eval/96_regular.edges
    cp $i $config_file
    # changing the config files to reflect the values of the current grid search state
    $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
    $python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
    $python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
    $python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $batchsize
    $python_bin/crudini --set $config_file DATASET random_seed $seed
    $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
    $python_bin/crudini --set $config_file COMMUNICATION offset 10720
    # $env_python $eval_file -cte 0 -ro 0 -tea $test_after -ld $log_dir -wsd $weight_store_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
    $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
    echo $i is done
    sleep 200
    echo end of sleep
  done
done
#
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.FFT
sharing_class = FFT
alpha = 0.1
change_based_selection = True
accumulation = True
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.GrowingAlpha
sharing_class = GrowingAlpha
init_alpha = 0.10
max_alpha = 0.75
k = 6
metadata_cap = 0.65
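The `*_package` / `*_class` pairs in these configs suggest that classes are resolved by dynamic import; a minimal sketch of that pattern. The helper below is hypothetical, not decentralizepy's actual loader, and it is demonstrated on a stdlib class so it runs without the framework installed.

import importlib

def load_class(package, class_name):
    # Hypothetical helper: resolve e.g. ("decentralizepy.sharing.FFT", "FFT").
    return getattr(importlib.import_module(package), class_name)

print(load_class("collections", "OrderedDict"))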