Commit 0c2b8989 authored by Jeffrey Wigger

Fixed the from_torch bug in wavelet sharing; fixed the circular-dependency deadlock in FFT and wavelet sharing.

Added run_all.sh and more step config files; run_all.sh rewrites each config with crudini.

Moved everything to sharing.
parent 72a9f1a6
Showing 413 additions and 26 deletions
#!/bin/bash
# Usage: run_all.sh <nfs_home> <python_bin_dir>
nfs_home=$1
python_bin=$2
decpy_path=$nfs_home/decentralizepy/eval
cd "$decpy_path" || exit 1
env_python=$python_bin/python3
graph=96_regular.edges # alternative: 4_node_fullyConnected.edges
config_file=~/tmp/config.ini
procs_per_machine=16
machines=6
iterations=5
test_after=21 # larger than iterations, so no test is ever run
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
# Machine id: look up this machine's ens785 IP address in the shared address file.
m=$(grep "$(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}')" "$ip_machines" | cut -d'"' -f2)
export PYTHONFAULTHANDLER=1
tests=("step_configs/config_celeba.ini" "step_configs/config_celeba_100.ini" "step_configs/config_celeba_fft.ini" "step_configs/config_celeba_wavelet.ini"
"step_configs/config_celeba_grow.ini" "step_configs/config_celeba_manualadapt.ini" "step_configs/config_celeba_randomalpha.ini"
"step_configs/config_celeba_randomalphainc.ini" "step_configs/config_celeba_roundrobin.ini" "step_configs/config_celeba_subsampling.ini"
"step_configs/config_celeba_topkrandom.ini" "step_configs/config_celeba_topkacc.ini" "step_configs/config_celeba_topkparam.ini")
for i in "${tests[@]}"
do
  echo "$i"
  # Name the log directory after the last underscore-separated part of the
  # config file name (e.g. config_celeba_fft.ini -> fft).
  IFS='_' read -ra NAMES <<< "$i"
  IFS='.' read -ra NAME <<< "${NAMES[-1]}"
  log_dir=$nfs_home/logs/testing/${NAME[0]}$(date '+%Y-%m-%dT%H:%M')/machine$m
  mkdir -p "$log_dir"
  cp "$i" "$config_file"
  "$python_bin"/crudini --set "$config_file" COMMUNICATION addresses_filepath "$ip_machines"
  $env_python "$eval_file" -ro 0 -ld "$log_dir" -mid "$m" -ps "$procs_per_machine" -ms "$machines" -is "$iterations" -gf "$graph" -ta "$test_after" -cf "$config_file" -ll "$log_level"
  echo "$i is done"
  sleep 3
  echo "end of sleep"
done
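The script takes the NFS home and the directory containing the Python binaries as its two arguments; a typical invocation (both paths illustrative) is bash run_all.sh /mnt/nfs/<user> /usr/bin. For each entry in the tests array it copies the step config to ~/tmp/config.ini, points COMMUNICATION.addresses_filepath at the shared IP file via crudini, and launches testing.py with 16 processes on each of the 6 machines.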
@@ -2,9 +2,9 @@
 dataset_package = decentralizepy.datasets.Celeba
 dataset_class = Celeba
 model_class = CNN
-images_dir = /home/risharma/leaf/data/celeba/data/raw/img_align_celeba
-train_dir = /home/risharma/leaf/data/celeba/per_user_data/train
-test_dir = /home/risharma/leaf/data/celeba/data/test
+images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
+train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
+test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
 ; python list of fractions below
 sizes =
@@ -14,11 +14,11 @@ optimizer_class = Adam
 lr = 0.001
 [TRAIN_PARAMS]
-training_package = decentralizepy.training.GradientAccumulator
-training_class = GradientAccumulator
-rounds = 20
+training_package = decentralizepy.training.Training
+training_class = Training
+rounds = 4
 full_epochs = False
-batch_size = 64
+batch_size = 16
 shuffle = True
 loss_package = torch.nn
 loss_class = CrossEntropyLoss
@@ -2,9 +2,9 @@
 dataset_package = decentralizepy.datasets.Celeba
 dataset_class = Celeba
 model_class = CNN
-images_dir = /home/risharma/leaf/data/celeba/data/raw/img_align_celeba
-train_dir = /home/risharma/leaf/data/celeba/per_user_data/train
-test_dir = /home/risharma/leaf/data/celeba/data/test
+images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
+train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
+test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
 ; python list of fractions below
 sizes =
@@ -16,9 +16,9 @@ lr = 0.001
 [TRAIN_PARAMS]
 training_package = decentralizepy.training.Training
 training_class = Training
-rounds = 20
+rounds = 4
 full_epochs = False
-batch_size = 64
+batch_size = 16
 shuffle = True
 loss_package = torch.nn
 loss_class = CrossEntropyLoss
step_configs/config_celeba_fft.ini (new file)

[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = Adam
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.FFT
sharing_class = FFT
alpha = 0.1
change_based_selection = True
accumulation = True
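For orientation, a minimal sketch of the compression step an FFT sharing scheme of this kind performs: keep only the top alpha fraction of frequency coefficients by magnitude. The function names are illustrative, not decentralizepy's API, and the change_based_selection and accumulation options are omitted.

import torch

def fft_compress(flat_params, alpha=0.1):
    # Real FFT of the flattened model; keep the k largest coefficients.
    coeffs = torch.fft.rfft(flat_params)
    k = max(1, int(alpha * coeffs.numel()))
    idx = torch.topk(coeffs.abs(), k).indices
    return idx, coeffs[idx]

def fft_decompress(idx, values, n):
    # Scatter the kept coefficients back and invert the transform.
    coeffs = torch.zeros(n // 2 + 1, dtype=values.dtype)
    coeffs[idx] = values
    return torch.fft.irfft(coeffs, n)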
@@ -2,9 +2,9 @@
 dataset_package = decentralizepy.datasets.Celeba
 dataset_class = Celeba
 model_class = CNN
-images_dir = /home/risharma/leaf/data/celeba/data/raw/img_align_celeba
-train_dir = /home/risharma/leaf/data/celeba/per_user_data/train
-test_dir = /home/risharma/leaf/data/celeba/data/test
+images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
+train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
+test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
 ; python list of fractions below
 sizes =
@@ -14,11 +14,11 @@ optimizer_class = Adam
 lr = 0.001
 [TRAIN_PARAMS]
-training_package = decentralizepy.training.GradientAccumulator
-training_class = GradientAccumulator
-rounds = 20
+training_package = decentralizepy.training.Training
+training_class = Training
+rounds = 4
 full_epochs = False
-batch_size = 64
+batch_size = 16
 shuffle = True
 loss_package = torch.nn
 loss_class = CrossEntropyLoss
step_configs/config_celeba_manualadapt.ini (new file)

[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = Adam
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.ManualAdapt
sharing_class = ManualAdapt
change_alpha = [0.1, 0.5]
change_rounds = [10, 30]
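One plausible reading of these two lists (an assumption, not confirmed by the source): each change_alpha[i] takes effect once the round counter reaches change_rounds[i], with a full share before the first boundary. A sketch under that assumption:

def alpha_at(round_no, change_alpha=(0.1, 0.5), change_rounds=(10, 30), initial=1.0):
    # Assumed semantics: change_alpha[i] applies from round change_rounds[i]
    # onward; both the pairing and the initial value are guesses.
    alpha = initial
    for boundary, value in zip(change_rounds, change_alpha):
        if round_no >= boundary:
            alpha = value
    return alpha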
step_configs/config_celeba_randomalpha.ini (new file)

[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = Adam
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.RandomAlpha
sharing_class = RandomAlpha
step_configs/config_celeba_randomalphainc.ini (new file)

[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = Adam
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.RandomAlphaIncremental
sharing_class = RandomAlphaIncremental
step_configs/config_celeba_roundrobin.ini (new file)

[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = Adam
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.RoundRobinPartial
sharing_class = RoundRobinPartial
alpha = 0.1
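A minimal sketch of round-robin partial sharing as the name and alpha suggest (illustrative code, not decentralizepy's API): each round a node sends a different block of roughly alpha * n parameters, cycling through the flattened model.

import torch

def round_robin_slice(flat_params, round_no, alpha=0.1):
    # Pick the block for this round, wrapping around the parameter vector.
    n = flat_params.numel()
    block = max(1, int(alpha * n))
    start = (round_no * block) % n
    idx = torch.arange(start, start + block) % n
    return idx, flat_params[idx]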
step_configs/config_celeba_subsampling.ini (new file)

[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = Adam
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.SubSampling
sharing_class = SubSampling
alpha = 0.1
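For contrast with the round-robin scheme, a sketch of what subsampling at alpha = 0.1 amounts to (illustrative names, assumed semantics): a uniformly random tenth of the flattened model is sent each round.

import torch

def subsample(flat_params, alpha=0.1):
    # Random alpha fraction of the parameters, fresh indices every call.
    n = flat_params.numel()
    k = max(1, int(alpha * n))
    idx = torch.randperm(n)[:k]
    return idx, flat_params[idx]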
step_configs/config_celeba_topkacc.ini (new file)

[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = Adam
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.PartialModel
sharing_class = PartialModel
alpha = 0.1
accumulation = True
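PartialModel with accumulation = True is a top-k-by-change scheme; a hedged sketch of the selection step (illustrative function, assumed semantics): accumulate the parameter change since the last share and send the entries with the largest accumulated magnitude.

import torch

def topk_by_change(accumulated_change, alpha=0.1):
    # Indices and values of the alpha fraction with the largest |change|.
    k = max(1, int(alpha * accumulated_change.numel()))
    idx = torch.topk(accumulated_change.abs(), k).indices
    return idx, accumulated_change[idx]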
step_configs/config_celeba_topkparam.ini (new file)

[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = Adam
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.TopKParams
sharing_class = TopKParams
alpha = 0.1
step_configs/config_celeba_topkrandom.ini (new file)

[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = Adam
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.TopKPlusRandom
sharing_class = TopKPlusRandom
alpha = 0.1
step_configs/config_celeba_wavelet.ini (new file)

[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = Adam
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.Wavelet
sharing_class = Wavelet
change_based_selection = True
alpha = 0.1
wavelet = sym2
level = None
accumulation = True
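A sketch of the transform this config names, using PyWavelets with the same wavelet and level settings (the function itself is illustrative; change-based selection and accumulation are again omitted): take a multilevel DWT of the flattened model, keep the top alpha fraction of coefficients by magnitude, and invert.

import numpy as np
import pywt

def wavelet_roundtrip(flat_params, alpha=0.1, wavelet="sym2", level=None):
    # level=None lets pywt choose the maximum decomposition level.
    coeffs = pywt.wavedec(flat_params, wavelet, level=level)
    arr, slices = pywt.coeffs_to_array(coeffs)
    k = max(1, int(alpha * arr.size))
    keep = np.argpartition(np.abs(arr), -k)[-k:]  # top-k by magnitude
    mask = np.zeros_like(arr)
    mask[keep] = arr[keep]
    rec = pywt.waverec(
        pywt.array_to_coeffs(mask, slices, output_format="wavedec"), wavelet
    )
    return rec[: flat_params.size]  # waverec may pad odd-length inputs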
@@ -14,8 +14,8 @@ optimizer_class = Adam
 lr = 0.001
 [TRAIN_PARAMS]
-training_package = decentralizepy.training.GradientAccumulator
-training_class = GradientAccumulator
+training_package = decentralizepy.training.Training
+training_class = Training
 rounds = 47
 full_epochs = False
 batch_size = 16
@@ -13,8 +13,8 @@ optimizer_class = Adam
 lr = 0.001
 [TRAIN_PARAMS]
-training_package = decentralizepy.training.GradientAccumulator
-training_class = GradientAccumulator
+training_package = decentralizepy.training.Training
+training_class = Training
 rounds = 20
 full_epochs = False
 batch_size = 64
@@ -23,7 +23,6 @@ batch_size = 16
 shuffle = True
 loss_package = torch.nn
 loss_class = CrossEntropyLoss
-accumulation = True
 [COMMUNICATION]
 comm_package = decentralizepy.communication.TCP
@@ -33,4 +32,5 @@ addresses_filepath = ip_addr_6Machines.json
 [SHARING]
 sharing_package = decentralizepy.sharing.PartialModel
 sharing_class = PartialModel
-alpha = 0.1
\ No newline at end of file
+alpha = 0.1
+accumulation = True
\ No newline at end of file
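This hunk is the "moved everything to sharing" part of the commit: the accumulation flag leaves [TRAIN_PARAMS] and reappears under [SHARING], matching the switch from the GradientAccumulator training class to plain Training elsewhere in the diff.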
@@ -65,3 +65,4 @@ if __name__ == "__main__":
             args.reset_optimizer,
         ],
     )
+    print("after spawn")
@@ -44,6 +44,7 @@ install_requires =
     localconfig
     PyWavelets
    pandas
+    crudini
 include_package_data = True
 python_requires = >=3.6
 [options.packages.find]
@@ -14,7 +14,7 @@ class Model(nn.Module):
         """
         super().__init__()
         self.accumulated_gradients = []
+        self.model_change = None
         self._param_count_ot = None
         self._param_count_total = None
         self.accumulated_changes = None