Diffstat (limited to 'tools/graphing.py')
-rw-r--r--  tools/graphing.py | 337
1 file changed, 337 insertions(+), 0 deletions(-)
diff --git a/tools/graphing.py b/tools/graphing.py
new file mode 100644
index 0000000..d2ea0c1
--- /dev/null
+++ b/tools/graphing.py
@@ -0,0 +1,337 @@
+import os
+import re
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+# Filename markers used to classify result CSVs by test type.
+FILENAME_MARKERS = {
+    "foreign": "-foreign-",
+    "self": "-self-",
+    "download": "-throughput-download-",
+    "upload": "-throughput-upload-",
+    "granular": "-throughput-granular-",
+}
+
+
+def seconds_since_start(dfs, start, column_name="SecondsSinceStart"):
+    """
+    Add a seconds-since-start column to every DataFrame in the list, based on
+    each frame's "CreationTime" column and the start time passed in.
+
+    :param dfs: list of DataFrames; each MUST contain a datetime column named "CreationTime"
+    :param start: datetime start time
+    :param column_name: name of the column to add, default "SecondsSinceStart"
+    :return: None; the column is added in place
+    """
+    for df in dfs:
+        df[column_name] = (df["CreationTime"] - start).dt.total_seconds()
+
+
+def find_earliest(dfs):
+    """
+    Return the earliest datetime in a list of DataFrames based on their
+    "CreationTime" columns. ASSUMES THE DATAFRAMES ARE SORTED.
+
+    :param dfs: list of DataFrames; each MUST contain a datetime column named "CreationTime" and MUST be sorted by it
+    :return: earliest datetime across all dfs
+    """
+    earliest = dfs[0]["CreationTime"].iloc[0]
+    for df in dfs:
+        if df["CreationTime"].iloc[0] < earliest:
+            earliest = df["CreationTime"].iloc[0]
+    return earliest
+
+
+def timeSinceStart(dfs, start):
+    """
+    Add a "TimeSinceStart" timedelta column, measured from start, to every
+    DataFrame in the list (in place).
+    """
+    for df in dfs:
+        df["TimeSinceStart"] = df["CreationTime"] - start
+
+
+def probeClean(df):
+    """Name the probe CSV columns, parse timestamps, and compute per-RTT duration."""
+    # ConnRTT and ConnCongestionWindow refer to the underlying connection.
+    df.columns = ["CreationTime", "NumRTT", "Duration", "ConnRTT", "ConnCongestionWindow", "Type", "Empty"]
+    df = df.drop(columns=["Empty"])
+    df["CreationTime"] = pd.to_datetime(df["CreationTime"], format="%m-%d-%Y-%H-%M-%S.%f")
+    df["Type"] = df["Type"].apply(str.strip)
+    # Average duration of a single round trip.
+    df["ADJ_Duration"] = df["Duration"] / df["NumRTT"]
+    df = df.sort_values(by=["CreationTime"])
+    return df
+
+
+def throughputClean(df):
+    """Name the throughput CSV columns, parse timestamps, and scale throughput to MB/s."""
+    df.columns = ["CreationTime", "Throughput", "NumberConnections", "Empty"]
+    df = df.drop(columns=["Empty"])
+    df["CreationTime"] = pd.to_datetime(df["CreationTime"], format="%m-%d-%Y-%H-%M-%S.%f")
+    df["ADJ_Throughput"] = df["Throughput"] / 1000000
+    df = df.sort_values(by=["CreationTime"])
+    return df
+
+
+def granularClean(df):
+    """Name the per-connection throughput CSV columns, parse timestamps, and scale throughput to MB/s."""
+    df.columns = ["CreationTime", "Throughput", "ID", "Type", "Empty"]
+    df = df.drop(columns=["Empty"])
+    df["CreationTime"] = pd.to_datetime(df["CreationTime"], format="%m-%d-%Y-%H-%M-%S.%f")
+    df["Type"] = df["Type"].apply(str.strip)
+    df["ADJ_Throughput"] = df["Throughput"] / 1000000
+    df = df.sort_values(by=["CreationTime"])
+    return df
+
+
+def make90Percentile(df):
+    """Keep only the rows at or below the 90th percentile of ADJ_Duration, re-sorted by time."""
+    df = df.sort_values(by=["ADJ_Duration"])
+    df = df.reset_index()
+    df = df.iloc[:int(len(df) * .9)]
+    df = df.sort_values(by=["CreationTime"])
+    return df
+
+
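+# A minimal sketch of how the helpers above compose, on made-up data:
+# find_earliest picks the global start time and seconds_since_start adds an
+# elapsed-seconds column in place. The frames and timestamps are invented
+# purely for illustration.
+#
+#   a = pd.DataFrame({"CreationTime": pd.to_datetime(["2022-01-01 00:00:00",
+#                                                     "2022-01-01 00:00:05"])})
+#   b = pd.DataFrame({"CreationTime": pd.to_datetime(["2022-01-01 00:00:02"])})
+#   seconds_since_start([a, b], find_earliest([a, b]))
+#   a["SecondsSinceStart"].tolist()  # [0.0, 5.0]
+#   b["SecondsSinceStart"].tolist()  # [2.0]
+
+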
+def main(title, paths):
+    # Data ingestion
+    foreign = pd.read_csv(paths["foreign"])
+    selfDf = pd.read_csv(paths["self"])
+    download = pd.read_csv(paths["download"])
+    upload = pd.read_csv(paths["upload"])
+    granular = pd.read_csv(paths["granular"])
+
+    # Data cleaning
+    foreign = probeClean(foreign)
+    selfDf = probeClean(selfDf)
+    download = throughputClean(download)
+    upload = throughputClean(upload)
+    granular = granularClean(granular)
+
+    # Data separation
+    selfUp = selfDf[selfDf["Type"] == "SelfUp"]
+    selfUp = selfUp.reset_index()
+    selfDown = selfDf[selfDf["Type"] == "SelfDown"]
+    selfDown = selfDown.reset_index()
+    granularUp = granular[granular["Type"] == "Upload"]
+    granularUp = granularUp.reset_index()
+    granularDown = granular[granular["Type"] == "Download"]
+    granularDown = granularDown.reset_index()
+
+    # 5-point moving average of the per-RTT durations
+    foreign["DurationMA5"] = foreign["ADJ_Duration"].rolling(window=5).mean()
+    selfUp["DurationMA5"] = selfUp["ADJ_Duration"].rolling(window=5).mean()
+    selfDown["DurationMA5"] = selfDown["ADJ_Duration"].rolling(window=5).mean()
+
+    # Normalize all frames to a shared start time
+    dfs = [foreign, selfUp, selfDown, download, upload, granularUp, granularDown]
+    timeSinceStart(dfs, find_earliest(dfs))
+    seconds_since_start(dfs, find_earliest(dfs))
+
+    xCol = "SecondsSinceStart"
+
+    def GraphNormal():
+        # Complete overview: probe durations on the left axis, throughput on the right.
+        fig, ax = plt.subplots()
+        ax.set_title(title)
+        ax.plot(foreign[xCol], foreign["ADJ_Duration"], "b.", label="foreign")
+        ax.plot(selfUp[xCol], selfUp["ADJ_Duration"], "r.", label="selfUP")
+        ax.plot(selfDown[xCol], selfDown["ADJ_Duration"], "c.", label="selfDOWN")
+        ax.plot(foreign[xCol], foreign["DurationMA5"], "b--", label="foreignMA")
+        ax.plot(selfUp[xCol], selfUp["DurationMA5"], "r--", label="selfUPMA")
+        ax.plot(selfDown[xCol], selfDown["DurationMA5"], "c--", label="selfDOWNMA")
+        ax.set_ylim([0, max(foreign["ADJ_Duration"].max(), selfDf["ADJ_Duration"].max())])
+        ax.legend(loc="upper left")
+
+        secax = ax.twinx()
+        secax.plot(download[xCol], download["ADJ_Throughput"], "g-", label="download (MB/s)")
+        secax.plot(granularDown[granularDown["ID"] == 0][xCol],
+                   granularDown[granularDown["ID"] == 0]["ADJ_Throughput"],
+                   "g--", label="Download Connection 0 (MB/s)")
+        secax.plot(upload[xCol], upload["ADJ_Throughput"], "y-", label="upload (MB/s)")
+        secax.plot(granularUp[granularUp["ID"] == 0][xCol],
+                   granularUp[granularUp["ID"] == 0]["ADJ_Throughput"],
+                   "y--", label="Upload Connection 0 (MB/s)")
+        secax.legend(loc="upper right")
+    # GraphNormal()
+
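+    # Sketch of the stacking idea used below, with invented numbers: within a
+    # time bucket, each connection's bar starts at the running total ("bottom")
+    # left by the connections already drawn, so segments stack instead of
+    # overlapping.
+    #
+    #   bottom = 0
+    #   # connection 0 contributes 2.5 MB/s at t=1 -> bar spans 0..2.5
+    #   bottom += 2.5
+    #   # connection 1 contributes 1.5 MB/s at t=1 -> bar spans 2.5..4.0
+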
id]):]["bottom"] + ) + # ,label=f"Download Connection {id}") + buckets["toadd_bottom"] = (granularDown[granularDown["ID"] == id]).set_index("bucket")["ADJ_Throughput"] + buckets["toadd_bottom"] = buckets["toadd_bottom"].fillna(0) + buckets["bottom"] += buckets["toadd_bottom"] + + + granularUp["bucket"] = granularUp["SecondsSinceStart"].round(0) + buckets = pd.DataFrame(granularUp["bucket"].unique()) + buckets.columns = ["bucket"] + buckets = buckets.set_index("bucket") + buckets["SecondsSinceStart"] = granularUp.drop_duplicates(subset=["bucket"]).reset_index()["SecondsSinceStart"] + buckets["bottom"] = 0 + for id in sorted(granularUp["ID"].unique()): + secax.bar(granularUp[yCol][granularUp["ID"] == id] - .05, granularUp["ADJ_Throughput"][granularUp["ID"] == id], + width=.09, bottom=buckets.iloc[len(buckets) - len(granularUp[granularUp["ID"] == id]):]["bottom"] + ) + #,label=f"Upload Connection {id}") + buckets["toadd_bottom"] = (granularUp[granularUp["ID"] == id]).set_index("bucket")["ADJ_Throughput"] + buckets["toadd_bottom"] = buckets["toadd_bottom"].fillna(0) + buckets["bottom"] += buckets["toadd_bottom"] + secax.legend(loc="upper right") + + + secax.legend(loc="upper left") + + #StackedThroughput() + stacked_bar_throughput(upload, granularUp, "SecondsSinceStart", "ADJ_Throughput", title + " Upload Stacked", + "Upload Throughput MB/s") + stacked_bar_throughput(download, granularDown, "SecondsSinceStart", "ADJ_Throughput", title + " Download Stacked", + "Download Throughput MB/s") + + def Percent90(): + ######### Graphing Removing 90th Percentile + nonlocal selfUp + nonlocal selfDown + nonlocal foreign + selfUp = make90Percentile(selfUp) + selfDown = make90Percentile(selfDown) + foreign = make90Percentile(foreign) + + # Recalculate MA + foreign["DurationMA5"] = foreign["ADJ_Duration"].rolling(window=5).mean() + selfUp["DurationMA5"] = selfUp["ADJ_Duration"].rolling(window=5).mean() + selfDown["DurationMA5"] = selfDown["ADJ_Duration"].rolling(window=5).mean() + + # Graphing Complete + fig, ax = plt.subplots() + ax.set_title(title + " 90th Percentile (ordered lowest to highest duration)") + # ax.plot(foreign[yCol], foreign["ADJ_Duration"], "b.", label="foreign") + # ax.plot(selfUp[yCol], selfUp["ADJ_Duration"], "r.", label="selfUP") + # ax.plot(selfDown[yCol], selfDown["ADJ_Duration"], "c.", label="selfDOWN") + ax.plot(foreign[yCol], foreign["DurationMA5"], "b--", label="foreignMA") + ax.plot(selfUp[yCol], selfUp["DurationMA5"], "r--", label="selfUPMA") + ax.plot(selfDown[yCol], selfDown["DurationMA5"], "c--", label="selfDOWNMA") + ax.set_ylim([0, max(foreign["ADJ_Duration"].max(), selfUp["ADJ_Duration"].max(), selfDown["ADJ_Duration"].max())]) + ax.legend(loc="upper left") + + secax = ax.twinx() + secax.plot(download[yCol], download["ADJ_Throughput"], "g-", label="download (MB/s)") + secax.plot(granularDown[granularDown["ID"] == 0][yCol], granularDown[granularDown["ID"] == 0]["ADJ_Throughput"], + "g--", label="Download Connection 0 (MB/S)") + secax.plot(upload[yCol], upload["ADJ_Throughput"], "y-", label="upload (MB/s)") + secax.plot(granularUp[granularUp["ID"] == 0][yCol], granularUp[granularUp["ID"] == 0]["ADJ_Throughput"], "y--", + label="Upload Connection 0 (MB/S)") + secax.legend(loc="upper right") + + Percent90() + +def stacked_bar_throughput(df, granular, xcolumn, ycolumn, title, label): + fig, ax = plt.subplots() + ax.set_title(title) + + secax = ax.twinx() + ax.get_yaxis().set_visible(False) + ax.set_xlabel("Seconds Since Start (s)") + secax.set_ylabel("Throughput (MB/s)") + # 
+def stacked_bar_throughput(df, granular, xcolumn, ycolumn, title, label):
+    fig, ax = plt.subplots()
+    ax.set_title(title)
+
+    secax = ax.twinx()
+    ax.get_yaxis().set_visible(False)
+    ax.set_xlabel("Seconds Since Start (s)")
+    secax.set_ylabel("Throughput (MB/s)")
+    # secax.set_xticks(range(0, round(granular[xcolumn].max()) + 1))  # Ticks every 1 second
+
+    # Plot the aggregate throughput curve
+    secax.plot(df[xcolumn], df[ycolumn], "k--", label=label)
+
+    df_gran = granular.copy()
+    # df_gran["bucket"] = df_gran[xcolumn].round(0)  # With rounding
+    df_gran["bucket"] = df_gran[xcolumn]  # Without rounding (CSV creation-time points need to be aligned)
+    buckets = pd.DataFrame(df_gran["bucket"].unique())
+    buckets.columns = ["bucket"]
+    buckets = buckets.set_index("bucket")
+    buckets[xcolumn] = df_gran.drop_duplicates(subset=["bucket"]).reset_index()[xcolumn]
+    buckets["bottom"] = 0
+    for connId in sorted(df_gran["ID"].unique()):
+        # NOTE: assumes each connection's samples line up with the tail of the bucket index.
+        secax.bar(df_gran[xcolumn][df_gran["ID"] == connId],
+                  df_gran[ycolumn][df_gran["ID"] == connId],
+                  width=.25,
+                  bottom=buckets.iloc[len(buckets) - len(df_gran[df_gran["ID"] == connId]):]["bottom"])
+        # label=f"Connection {connId}"
+        buckets["toadd_bottom"] = (df_gran[df_gran["ID"] == connId]).set_index("bucket")[ycolumn]
+        buckets["toadd_bottom"] = buckets["toadd_bottom"].fillna(0)
+        buckets["bottom"] += buckets["toadd_bottom"]
+
+    secax.legend(loc="upper right")
+
+
+def findFiles(directory):
+    """Scan a directory and group result CSVs by their shared filename prefix and suffix."""
+    matches = {}
+
+    for file in os.listdir(directory):
+        if os.path.isfile(os.path.join(directory, file)):
+            for name in FILENAME_MARKERS:
+                regex = "(?P<start>.*)(?P<type>" + FILENAME_MARKERS[name] + ")(?P<end>.*)"
+                match = re.match(regex, file)
+                if match is not None:
+                    start = match.group("start")
+                    end = match.group("end")
+                    if start not in matches:
+                        matches[start] = {}
+                    if end not in matches[start]:
+                        matches[start][end] = {}
+                    if name in matches[start][end]:
+                        print("ERROR: multiple files match", name, "for prefix", start, "and suffix", end)
+                    matches[start][end][name] = os.path.join(directory, file)
+    return matches
+
+
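+# Hypothetical illustration of the grouping findFiles produces. Given files
+# named like "run1-foreign-2022.csv", "run1-self-2022.csv", and so on (names
+# invented here), the result maps prefix -> suffix -> test type -> path:
+#
+#   findFiles("./Data/WillTest/")
+#   # {"run1": {"2022.csv": {"foreign": "./Data/WillTest/run1-foreign-2022.csv",
+#   #                        "self": "./Data/WillTest/run1-self-2022.csv",
+#   #                        ...}}}
+
+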
+def generatePaths():
+    # Placeholder for manually specifying the five input CSV paths.
+    return {
+        "foreign": "",
+        "self": "",
+        "download": "",
+        "upload": "",
+        "granular": "",
+    }
+
+
+def makeGraphs(files):
+    for start in files:
+        x = 0
+        for end in files[start]:
+            # Skip any group that is missing one of the expected file types
+            containsAll = all(key in files[start][end] for key in FILENAME_MARKERS)
+            if not containsAll:
+                continue
+
+            main(start + " - " + str(x), files[start][end])
+            x += 1
+
+
+if __name__ == '__main__':
+    paths = generatePaths()
+
+    files = findFiles("./Data/WillTest/")
+    print(files)
+    makeGraphs(files)
+
+    plt.show()
\ No newline at end of file
