Skip to content

Commit

Permalink
Bug fixes
Browse files: browse the repository at this point in the history
  • Loading branch information
maxwelltsai committed May 27, 2020
1 parent 4b39218 commit 429d5c5
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 16 deletions.
41 changes: 30 additions & 11 deletions deep_galaxy_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import time
import argparse
import logging
import importlib
import importlib

try:
import horovod.keras as hvd
Expand Down Expand Up @@ -90,10 +90,8 @@ def initialize(self):
# sess = tf.compat.v1.Session(config=config)
# tf.compat.v1.keras.backend.set_session(sess) # set this TensorFlow session as the default session for Keras

# Create logger
self.logger = logging.getLogger('DeepGalaxyTrain')
self.logger.setLevel(self.log_level)
self.logger.addHandler(logging.FileHandler('train_log.txt'))
tf.keras.backend.set_image_data_format('channels_last')

if self.distributed_training is True:
try:
import horovod.tensorflow.keras as hvd
Expand All @@ -103,12 +101,17 @@ def initialize(self):
self.callbacks.append(hvd.callbacks.MetricAverageCallback())
# self.callbacks = [hvd.BroadcastGlobalVariablesHook(0)]
if hvd.rank() == 0:
# Create logger
self.logger = logging.getLogger('DeepGalaxyTrain')
self.logger.setLevel(self.log_level)
self.logger.addHandler(logging.FileHandler('train_log.txt'))
self.logger.info('Parallel training enabled.')
self.logger.info('batch_size = %d, global_batch_size = %d, num_workers = %d\n' % (self.batch_size, self.batch_size*hvd.size(), hvd.size()))

# Map an MPI process to a GPU (Important!)
print('hvd_rank = %d, hvd_local_rank = %d' % (hvd.rank(), hvd.local_rank()))
self.logger.info('hvd_rank = %d, hvd_local_rank = %d' % (hvd.rank(), hvd.local_rank()))
if hvd.rank() == 0:
self.logger.info('hvd_rank = %d, hvd_local_rank = %d' % (hvd.rank(), hvd.local_rank()))

# Bind a CUDA device to one MPI process (has no effect if GPUs are not used)
os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
Expand All @@ -130,7 +133,16 @@ def initialize(self):
except ImportError as identifier:
print('Error importing horovod. Disabling distributed training.')
self.distributed_training = False
self.logger = logging.getLogger('DeepGalaxyTrain')
self.logger.setLevel(self.log_level)
self.logger.addHandler(logging.FileHandler('train_log.txt'))
self.logger.info('Parallel training disabled.')
self.logger.info('Batch_size = %d' % (self.batch_size))
else:
# Create logger
self.logger = logging.getLogger('DeepGalaxyTrain')
self.logger.setLevel(self.log_level)
self.logger.addHandler(logging.FileHandler('train_log.txt'))
self.logger.info('Parallel training disabled.')
self.logger.info('Batch_size = %d' % (self.batch_size))

Expand All @@ -139,11 +151,14 @@ def load_data(self, data_fn, dset_name_pattern, camera_pos, test_size=0.2, rando
if not self.distributed_training:
self.logger.info('Loading the full dataset since distributed training is disabled ...')
X, Y = self.data_io.load_all(data_fn, dset_name_pattern=dset_name_pattern, camera_pos=camera_pos)
self.logger.debug('Shape of X: %s' % str(X.shape))
self.logger.debug('Shape of Y: %s' % str(Y.shape))
else:
self.logger.info('Loading part of the dataset since distributed training is enabled ...')
X, Y = self.data_io.load_partial(data_fn, dset_name_pattern=dset_name_pattern, camera_pos=camera_pos, hvd_size=hvd.size(), hvd_rank=hvd.rank())
self.logger.debug('Shape of X: %s' % str(X.shape))
self.logger.debug('Shape of Y: %s' % str(Y.shape))
if hvd.rank() == 0:
self.logger.info('Loading part of the dataset since distributed training is enabled ...')
self.logger.debug('Shape of X: %s' % str(X.shape))
self.logger.debug('Shape of Y: %s' % str(Y.shape))

# update the input_shape setting according to the loaded data
self.input_shape = X.shape[1:]
Expand All @@ -158,15 +173,19 @@ def load_data(self, data_fn, dset_name_pattern, camera_pos, test_size=0.2, rando
self.x_train = X
self.y_train = Y
self.num_classes = np.unique(Y).shape[0]
self.logger.debug('Number of classes: %d' % self.num_classes)
if not self.distributed_training:
self.logger.debug('Number of classes: %d' % self.num_classes)
else:
if hvd.rank() == 0:
self.logger.debug('Number of classes: %d' % self.num_classes)

def load_model(self):
# if not os.path.isfile('efn_b4.h5'):
# base_model = efn.EfficientNetB4(weights=None, include_top=True, input_shape=(self.input_shape[0], self.input_shape[1], 3), classes=self.num_classes)
# base_model.save('efn_b4.h5')
# else:
# base_model = tf.keras.models.load_model('efn_b4.h5', compile=False)

if 'EfficientNet' in self.base_model_name:
base_model = getattr(efn, self.base_model_name)(weights=None, include_top=True, input_shape=(self.input_shape[0], self.input_shape[1], 3), classes=self.num_classes)
else:
Expand Down
10 changes: 5 additions & 5 deletions dg_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# Dynamically creating training sessions with command-line arguments.
"""

import argparse
import argparse
from deep_galaxy_training import DeepGalaxyTraining


Expand All @@ -26,7 +26,7 @@
parser.add_argument('--allow-growth', dest='allow_growth', action='store_true', default=True, help='Allow GPU memory to grow dypnamically according to the size of the model.')
parser.add_argument('--gpu-mem-frac', dest='gpu_mem_frac', type=float, default=None, help='Fraction of GPU memory to allocate per process. If None, this is handled automaticaly. If a number > 1 is given, unified memory is used.')
parser.add_argument('--no-distributed', dest='distributed', action='store_false', help='Turn off Horovid distributed training')
parser.add_argument('--noise', dest='noise_stddev', default=0.2, help='The stddev of the Gaussian noise for mitigatyying overfitting')
parser.add_argument('--noise', dest='noise_stddev', type=float, default=0.2, help='The stddev of the Gaussian noise for mitigatyying overfitting')
parser.add_argument('--num-camera', dest='num_cam', type=int, default=3, help='Number of camera positions (for data augmentation). Choose an integer between 1 and 14')
args = parser.parse_args()

Expand All @@ -37,12 +37,12 @@
dgtrain.multi_gpu_training = args.multi_gpu
dgtrain.base_model_name = args.dnn_arch
dgtrain.noise_stddev = args.noise_stddev
dgtrain.batch_size = args.batch_size
dgtrain.learning_rate = args.lr
dgtrain.batch_size = args.batch_size
dgtrain.learning_rate = args.lr
dgtrain.epochs = args.epochs
dgtrain._gpu_memory_allow_growth = args.allow_growth
if args.gpu_mem_frac is None:
dgtrain._gpu_memory_fraction = None
dgtrain._gpu_memory_fraction = None
else:
dgtrain._gpu_memory_fraction = float(args.gpu_mem_frac)
dgtrain.initialize()
Expand Down

0 comments on commit 429d5c5

Please sign in to comment.