Commit 6bbe02c

Merge branch 'main' of github.com:hipersys-team/lightning

zhizhenzhong committed Aug 22, 2023
2 parents 9119c13 + b749719
Showing 87 changed files with 1,416 additions and 0 deletions.
1 change: 1 addition & 0 deletions simulation/.gitignore
@@ -0,0 +1 @@
.DS_Store
52 changes: 52 additions & 0 deletions simulation/README.md
@@ -0,0 +1,52 @@
# Lightning Simulation Code
The Lightning simulations compare the performance of large DNNs on Lightning against their performance on GPUs and AI accelerators. The code in this folder reproduces results reported in Section 9 of the Lightning paper.

## Overview
Event-driven simulation occurs in two phases:

1. Scheduling DNN requests. `build_sim_for_mixed_arrivals()` schedules a given number of inference requests, spacing them according to each DNN's input size with Poisson-distributed interarrival times.

2. Simulating the schedule. `Simulator.simulate()` plays back the requests at their scheduled times and returns the average completion time per model.

The simulator supports comparisons between Lightning, NVIDIA's A100 and A100X (the latter appears as `dpu` in the code), and Microsoft Brainwave. A minimal usage sketch of the two phases follows.
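
A minimal sketch, assuming `build_sim_for_mixed_arrivals()` lives in `sim.py` and returns a `Simulator` (the import path, argument names, and return type are assumptions; consult `sim.py` for the actual interface):

```python
# Hypothetical usage sketch -- the import path, argument names, and return
# type are assumptions; see sim.py for the actual interface.
from sim import build_sim_for_mixed_arrivals

# Phase 1: schedule 100 mixed DNN inference requests over a 0.06 Gbps network.
sim = build_sim_for_mixed_arrivals(num_reqs=100, network_speed=0.06)

# Phase 2: play back the schedule and collect average completion times per model.
avg_completion_times = sim.simulate()
```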

## Folder structure
| Source Files |Description |
| ----- | ----- |
| `orders/` | Different mixed orders of DNN requests to be converted to schedules |
| `sim_scheds/` | Schedules of DNN request traces for simulation (specific to a network speed) |
| `congestion_plot.py` | Plots the active DNN request count over time for a finished simulation |
| `csv_gen.sh` | Converts the simulator's trial outputs to CSV format (in `results/`) for further analysis |
| `dnn_classes.py` | Foundational class structures for representing deep neural network (DNN) architectures |
| `final_gen.sh` | Bash utility that batch-reads and processes CSV files using the `read_csv.py` Python script |
| `gen_mixed.py` | Converts a DNN request order into a network-speed-specific schedule for simulation |
| `make_order.py` | Generates and saves a random order of DNN requests |
| `models.py` | Provides a way to generate and represent the layers for popular DNNs such as BERT-Large, GPT-2 (Extra-Large), LeNet-300-100, AlexNet, ResNet-18, VGG-16, VGG-19, and Meta's DLRM |
| `read_csv.py` | Processes runtime data for DNNs executed on different processors and then stores the results in a TSV-formatted file |
| `README.md` | This file, describing the requirements and instructions to reproduce the experiments |
| `requirements.txt` | List of all of the Python dependencies |
| `run.sh` | Conducts a series of simulations (via the `sim.py` script) on different types of processors |
| `sched_gen.sh` | Generates multiple DNN request traces for simulations |
| `sim_classes.py` | Useful data structures for simulation |
| `sim.py` | Event-driven simulator code |
| `trial_to_csv.py` | Parses a trial file, extracts the average request completion times, total runtime, and active request count over time, and stores them in CSV format |
| `utils.py` | Utility functions for simulator |

## Usage

### 1. Install requirements
Verify that you have Python 3 set up, then install the necessary packages with `python3 -m pip install -r requirements.txt`.

### 2. Launch simulations
Run simulations with `bash run.sh` (default configuration: 10 unique DNN traces over a 60 Mbps network). Note that `run.sh` launches 40 simulations in parallel, one per trace-processor pair.

If you'd like new schedules, execute `sched_gen.sh` before `run.sh`.

### 3. Convert logs to CSVs
Once the 40 simulations initiated by `run.sh` are complete, execute `bash csv_gen.sh` to parse the trial logs.

### 4. Generate average speedups by DNN model
Run `bash final_gen.sh` to generate a TSV-formatted file with the average runtimes of each DNN model over each processor.

### 5. Other useful information
`congestion_plot.py` provides a plot of the active DNN request count over time for a single simulation; be sure to read its `ParseOpt` function to pass the correct arguments (an example invocation is shown below). `job_stats/` includes the start and end times of each DNN layer from any past simulation.
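
For example, an invocation matching the defaults in `csv_gen.sh` and `final_gen.sh` (run this after `csv_gen.sh` has produced the CSVs in `results/active_reqs/`):

```bash
python3 congestion_plot.py --batch_size=1 --lightning_core_count=576 \
    --num_reqs=100 --network_speed=0.06 --pkl_num=1 --preemptive=P
```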
76 changes: 76 additions & 0 deletions simulation/congestion_plot.py
@@ -0,0 +1,76 @@
from typing import Tuple, Dict, List
import matplotlib.pyplot as plt
import argparse

def congestion_plot(lightning_over_t:List[Tuple[int,int]],
                    a100_over_t:List[Tuple[int,int]],
                    dpu_over_t:List[Tuple[int,int]],
                    brainwave_over_t:List[Tuple[int,int]],
                    out_filepath:str,
                    network_speed:str) -> None:
lightning_x_1 = [t[0] for t in lightning_over_t]
lightning_y_1 = [t[1] for t in lightning_over_t]
a100_x_1 = [t[0] for t in a100_over_t]
a100_y_1 = [t[1] for t in a100_over_t]
a100x_x_1 = [t[0] for t in dpu_over_t]
a100x_y_1 = [t[1] for t in dpu_over_t]
brainwave_x_1 = [t[0] for t in brainwave_over_t]
brainwave_y_1 = [t[1] for t in brainwave_over_t]

fig, ax = plt.subplots()
ax.plot(a100_x_1, a100_y_1, label="A100")
ax.plot(a100x_x_1, a100x_y_1, label="A100X")
ax.plot(brainwave_x_1, brainwave_y_1, label="Brainwave")
ax.plot(lightning_x_1, lightning_y_1, label="Lightning")

ax.set_xlabel('Time (ns)')
ax.set_ylabel('Active Requests')
ax.set_title(f'Active Requests vs Time for {network_speed}Gbps')
ax.legend()
plt.xscale("log")

plt.savefig(out_filepath)
print(f"Output accessible in {out_filepath}")

def ParseOpt(known=False):
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, help="maximum batch size for processor")
parser.add_argument('--lightning_core_count', type=int, help="number of cores for Lightning")
parser.add_argument('--num_reqs', type=int, help="exact number of requests to simulate")
parser.add_argument('--network_speed', type=str, help="network speed (in Gbps)")
parser.add_argument('--pkl_num', type=int, help="request schedule pickle file identifier")
parser.add_argument('--preemptive', type=str, help='P if preemptive scheduling (NP otherwise)')
opt = parser.parse_known_args()[0] if known else parser.parse_args()

return opt

if __name__=="__main__":
opt = ParseOpt()

    network_speed = opt.network_speed
    # Read each processor's active-request trace from its results CSV.
    traces = {}
    for processor in ("lightning", "a100", "dpu", "brainwave"):
        over_t = []
        with open(f"./results/active_reqs/{processor}_{network_speed}_Gbps_l{opt.lightning_core_count}_cores_{opt.num_reqs}_reqs_{opt.batch_size}_BS_{opt.pkl_num}.csv", "r") as file:
            for line in file:
                timestamp, count = line.split(',')
                over_t.append((float(timestamp), int(count)))
        traces[processor] = over_t
    out_filepath = f"final_{opt.preemptive}_request_count_vs_time_{network_speed}_Gbps_l{opt.lightning_core_count}_{opt.batch_size}_BS_{opt.num_reqs}_reqs_{opt.pkl_num}.png"
    congestion_plot(traces["lightning"], traces["a100"], traces["dpu"], traces["brainwave"], out_filepath, network_speed)
16 changes: 16 additions & 0 deletions simulation/csv_gen.sh
@@ -0,0 +1,16 @@
NETWORK_SPEED="0.06"
NUM_REQS=100
LIGHTNING_BATCH_SIZE=1
LIGHTNING_CORE_COUNT=576

mkdir -p results/runtimes results/active_reqs

for PKL_NUM in 1 2 3 4 5 6 7 8 9 10
do
for PROCESSOR in "lightning" "a100" "dpu" "brainwave"
do
python3 trial_to_csv.py --lightning_core_count=$LIGHTNING_CORE_COUNT --batch_size=$LIGHTNING_BATCH_SIZE --num_reqs=$NUM_REQS --network_speed=$NETWORK_SPEED --pkl_num=$PKL_NUM --processor=$PROCESSOR &
done
done
wait  # block until all 40 background conversions have finished
79 changes: 79 additions & 0 deletions simulation/dnn_classes.py
@@ -0,0 +1,79 @@
from typing import Tuple, List, Dict, Set

class Layer():
'''
Layer of DNN
'''
def __init__(self, id, prereqs, children):
self.id = id
self.prereqs = prereqs
self.children = children


class ConvLayer(Layer):
'''
Convolutional layer of DNN
'''
    def __init__(self, id:int, input_channels:int, kernel_size:int, output_shape:Tuple[int,int,int,int], prereqs=None, children=None) -> None:
        '''
        Parameters
        ----------
        input_channels: number of channels of input to layer
        kernel_size: size of kernel edge used in convolution on input in this layer
        output_shape: tuple (batch_num, output_channels, output_height, output_width)
        '''
        # Avoid shared mutable default arguments across instances.
        super().__init__(id, prereqs if prereqs is not None else set(), children if children is not None else [])
self.input_channels = input_channels
self.kernel_size = kernel_size
self.output_shape = output_shape


class FCLayer(Layer):
'''
Fully-connected layer of DNN
'''
    def __init__(self, id:int, input_size:int, output_size:int, prereqs=None, children=None) -> None:
        '''
        Parameters
        ----------
        input_size: size of vectors being multiplied in layer
        output_size: number of products computed in layer
        '''
        # Avoid shared mutable default arguments across instances.
        super().__init__(id, prereqs if prereqs is not None else set(), children if children is not None else [])
self.input_size = input_size
self.output_size = output_size


class Model():
'''
DNN Architecture
'''
def __init__(self, input_dims:Tuple[int,int], input_channels:float, layers:List[Layer]) -> None:
'''
Parameters
----------
input_dims: height and width of input
input_channels: number of bytes the channels occupy
layers: list of layers that make up DNN
'''
self.input_size = input_dims[0] * input_dims[1] * input_channels # in bytes
self.layers = layers


class ReadableModel():
'''
DNN in simulation-friendly format
'''
def __init__(self, name:str, layer_index:Dict[int,Tuple[int,int,List[int]]], prereqs:Dict[int,Set[int]], independent_layers:Set[int]) -> None:
'''
Parameters
----------
name: name of DNN
layer_index: table of layer id -> (vector_len, VVPs, children)
prereqs: table of layer id -> layers they're dependent on (that haven't been computed yet)
independent_layers: set of layers that don't have any prereqs
'''
self.name = name
self.layer_index = layer_index
self.prereqs = prereqs
self.independent_layers = independent_layers
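
A minimal, hypothetical example of how these classes compose (the layer ids, shapes, and sizes below are illustrative, and the import assumes you are running from the `simulation/` directory):

```python
# Toy two-layer network: a conv layer feeding a fully-connected layer.
from dnn_classes import ConvLayer, FCLayer, Model

conv = ConvLayer(id=0, input_channels=1, kernel_size=5,
                 output_shape=(1, 6, 24, 24), children=[1])
fc = FCLayer(id=1, input_size=6 * 24 * 24, output_size=10, prereqs={0})
toy = Model(input_dims=(28, 28), input_channels=1, layers=[conv, fc])
print(toy.input_size)  # 28 * 28 * 1 = 784 bytes
```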
8 changes: 8 additions & 0 deletions simulation/final_gen.sh
@@ -0,0 +1,8 @@
CORES=576
BS=1
PREEMPTIVE="P"

for NS in 0.06
do
python3 read_csv.py --num_reqs=100 --lightning_core_count=$CORES --network_speed=$NS --batch_size=$BS --preemptive=$PREEMPTIVE &
done
49 changes: 49 additions & 0 deletions simulation/gen_mixed.py
@@ -0,0 +1,49 @@
from typing import List, Tuple
from models import MODELS
import numpy as np
import pickle
import argparse

def gen_mixed_arrivals(order:List[str], network_speed:float, poisson=True) -> List[Tuple[str,float]]:
'''
Generates a sequence of mixed arrivals
Parameters
----------
order: names of models in the order to be scheduled
network_speed: network speed in Gbps
poisson: whether interarrivals should be randomly distributed (otherwise even)
Returns
-------
schedule: sequence of mixed arrivals (model names and their arrival times in ns)
'''
schedule = []
time = 0
for model_name in order:
        bit_stream = MODELS[model_name].input_size*8 # bytes => bits
        interarrival_space = bit_stream / network_speed # bits / (Gb/s) = bits / (bits/ns) = ns
if poisson:
time += round(np.random.exponential(interarrival_space))
else:
time += interarrival_space
schedule.append((model_name, time))
return schedule

def ParseOpt(known=False):
parser = argparse.ArgumentParser()
parser.add_argument('--network_speed', type=float, help="network speed (in Gbps)")
parser.add_argument('--pkl_num', type=int, help="request schedule pickle file identifier")
opt = parser.parse_known_args()[0] if known else parser.parse_args()

return opt

if __name__=="__main__":
opt = ParseOpt()
with open(f"./orders/order_{opt.pkl_num}.pkl", "rb") as file:
rand_order = pickle.load(file)
sched = gen_mixed_arrivals(rand_order, opt.network_speed)
sched_filename = f'sim_scheds/mixed_sched_{opt.network_speed}_Gbps_{opt.pkl_num}.pkl'
with open(sched_filename, 'wb') as file:
pickle.dump(sched, file)
print(f"Schedule accessible at {sched_filename}")
34 changes: 34 additions & 0 deletions simulation/make_order.py
@@ -0,0 +1,34 @@
from typing import List
import argparse
import random
import pickle

def gen_random_order(num_reqs:int, possible_models:List[str]) -> List[str]:
'''
Generates a random order of DNN requests
Parameters
----------
    num_reqs: number of requests in the order
possible_models: names of DNNs to randomly schedule
Returns
-------
    order: list of `num_reqs` randomly chosen DNN request names
'''
return [random.choice(possible_models) for _ in range(num_reqs)]

def ParseOpt(known=False):
parser = argparse.ArgumentParser()
parser.add_argument('--num_reqs', type=int, help="exact number of requests to simulate")
parser.add_argument('--pkl_num', type=int, help="request order pickle file identifier")
opt = parser.parse_known_args()[0] if known else parser.parse_args()

return opt

if __name__=="__main__":
opt = ParseOpt()
possible_models = ["AlexNet", "ResNet-18", "VGG-16", "VGG-19", "BERT", "GPT-2", "DLRM"]
rand_order = gen_random_order(opt.num_reqs, possible_models)
with open(f'orders/order_{opt.pkl_num}.pkl', 'wb') as file:
pickle.dump(rand_order, file)