From 058276493fc7e3609b4d9fa65e04ffe7da221b7e Mon Sep 17 00:00:00 2001 From: Kevin Hoxha Date: Mon, 25 Jul 2022 10:29:33 -0700 Subject: [PATCH] ddsketch_utility: Add utility scripts for ddsketch (#7602) * Adds ddsketch_calc.py which implements a class for DDSketch related calculations * Adds ddsketch_conversion.py for quickly computing a bucket index to a value or vice-versa * Adds ddsketch_compare.py to compute how similar two ddsketch distributions are * Adds export_graph.py to graph the ddsketch distribution outputted from mako The arguments for ddsketch_conversion.py are: -b, --bucket: the bucket index that we need to calculate the value from (optional) -v, --value: the value that we need to calculate the bucket index from (optional) -e, --error_guarantee: the error guarantee for ddsketch (optional, default is 0.005) The arguments for ddsketch_compare.py are: --file1: Path to first ddsketch json --file2: Path to second ddsketch json --txn1: The transaction type for the first file --txn2: The transaction type for the second file --op: The operation name (ex: GRV, GET ...) 
The arguments for export_graph.py: --file: path to ddsketch distribution --txn, -t: Transaction type from file --title: title for graph (optional, otherwise "Title" is used) --savefig: Path to save the image plot (optional) --op: Which operation to plot --- contrib/ddsketch_calc.py | 72 ++++++++++++++++++++++++++++++++++ contrib/ddsketch_compare.py | 70 +++++++++++++++++++++++++++++++++ contrib/ddsketch_conversion.py | 45 +++++++++++++++++++++ contrib/export_graph.py | 67 +++++++++++++++++++++++++++++++ 4 files changed, 254 insertions(+) create mode 100644 contrib/ddsketch_calc.py create mode 100644 contrib/ddsketch_compare.py create mode 100644 contrib/ddsketch_conversion.py create mode 100644 contrib/export_graph.py diff --git a/contrib/ddsketch_calc.py b/contrib/ddsketch_calc.py new file mode 100644 index 0000000000..b113cb37dc --- /dev/null +++ b/contrib/ddsketch_calc.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# +# ddsketch_calc.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import math as m + + +# Implements a DDSketch class as described in: +# https://arxiv.org/pdf/1908.10693.pdf + +# This class has methods that use cubic interpolation to quickly compute log +# and inverse log. The coefficients A,B,C as well as correctingFactor are +# all constants used for interpolating. 
# The implementation for interpolation was originally seen here in:
# https://github.com/DataDog/sketches-java/
# in the file CubicallyInterpolatedMapping.java

class DDSketch:
    """Converts between sample values and DDSketch bucket indices.

    The mapping uses a cubic polynomial in the floating-point significand to
    approximate a base-2 logarithm (``fastlog``) and its inverse
    (``reverseLog``), then rescales it by ``multiplier`` so that bucket
    boundaries follow powers of ``gamma``.

    Args:
        errorGuarantee: relative error of the sketch; must satisfy
            ``0 < errorGuarantee < 1``.

    Raises:
        ValueError: if ``errorGuarantee`` is outside the open interval (0, 1).
    """

    # Coefficients of the interpolating cubic ((A*s + B)*s + C)*s.
    # A + B + C == 1, so the polynomial goes from 0 at s == 0 to 1 at s == 1.
    A = 6.0 / 35.0
    B = -3.0 / 5.0
    C = 10.0 / 7.0
    # Used only to derive ``offset`` (the index of 1 / EPS before shifting).
    EPS = 1e-18
    # Constant applied to the multiplier alongside log(2) / log(gamma).
    correctingFactor = 1.00988652862227438516
    # Class-level placeholders; every instance overwrites them in __init__.
    offset = 0
    multiplier = 0
    gamma = 0

    def __init__(self, errorGuarantee):
        # Outside (0, 1) gamma is <= 1, negative or undefined, which either
        # divides by zero or silently produces meaningless bucket indices.
        if not 0 < errorGuarantee < 1:
            raise ValueError(
                "errorGuarantee must be in the open interval (0, 1), got %r"
                % (errorGuarantee,)
            )
        self.gamma = (1 + errorGuarantee) / (1 - errorGuarantee)
        self.multiplier = (self.correctingFactor * m.log(2)) / m.log(self.gamma)
        # The offset is defined as the *unshifted* index of 1 / EPS, so it
        # must be computed while the offset itself is still zero.
        self.offset = 0
        self.offset = self.getIndex(1.0 / self.EPS)

    def fastlog(self, value):
        """Return the cubic-interpolated approximation of log2(value).

        ``np.frexp`` splits ``value`` into mantissa * 2**exponent with the
        mantissa in [0.5, 1) for positive input; the mantissa is rescaled to
        s in [0, 1) and fed to the cubic. Exact powers of two give s == 0 and
        therefore an exact integer result.
        """
        mantissa, exponent = np.frexp(value)
        s = mantissa * 2 - 1
        return ((self.A * s + self.B) * s + self.C) * s + exponent - 1

    def reverseLog(self, index):
        """Invert ``fastlog``: return the value whose fastlog is ``index``.

        The integer part of ``index`` is the binary exponent; the fractional
        part is mapped back to a significand by solving the cubic in closed
        form (d0 / d1 are the usual discriminant terms of the cubic formula).
        """
        exponent = m.floor(index)
        d0 = self.B * self.B - 3 * self.A * self.C
        d1 = (2 * self.B * self.B * self.B
              - 9 * self.A * self.B * self.C
              - 27 * self.A * self.A * (index - exponent))
        p = np.cbrt((d1 - np.sqrt(d1 * d1 - 4 * d0 * d0 * d0)) / 2)
        significandPlusOne = -(self.B + p + d0 / p) / (3 * self.A) + 1
        # significandPlusOne * 2**exponent, expressed through ldexp.
        return np.ldexp(significandPlusOne / 2, exponent + 1)

    def getIndex(self, sample):
        """Return the bucket index for a strictly positive ``sample``.

        Raises:
            ValueError: if ``sample`` is not greater than zero (the logarithm
                is undefined there and the result would be meaningless).
        """
        if not sample > 0:
            raise ValueError("sample must be > 0, got %r" % (sample,))
        return m.ceil(self.fastlog(sample) * self.multiplier) + self.offset

    def getValue(self, idx):
        """Return the representative value for bucket index ``idx``.

        NOTE(review): the 2 / (1 + gamma) factor appears to move the result
        from the bucket's upper edge to a point inside the bucket - confirm
        against the DDSketch paper.
        """
        return self.reverseLog((idx - self.offset) / self.multiplier) * 2.0 / (1 + self.gamma)
import argparse
import json
import sys

import numpy as np


def relative_entropy(p, q):
    """Return the Kullback-Leibler divergence (relative entropy) of p from q.

    Computed in bits (log base 2). Terms where either ``p[i]`` or ``q[i]`` is
    zero are skipped. ``q`` must be at least as long as ``p``.
    """
    difference = 0.0
    for i, p_i in enumerate(p):
        q_i = q[i]
        if p_i != 0.0 and q_i != 0.0:
            difference += p_i * np.log2(p_i / q_i)
    return difference


def relative_entropy_symmetric(dd1, dd2):
    """Return the Jensen-Shannon divergence of two bucket-count lists.

    Both inputs are normalized into probability distributions first. The
    result is 0 for identical distributions and 1 for distributions with no
    overlap.

    Raises:
        ValueError: if the lists differ in length or either one sums to zero
            (an empty sketch cannot be normalized).
    """
    if len(dd1) != len(dd2):
        raise ValueError(
            "bucket lists differ in length (%d vs %d)" % (len(dd1), len(dd2))
        )
    sum1 = sum(dd1)
    sum2 = sum(dd2)
    if sum1 == 0 or sum2 == 0:
        raise ValueError("cannot compare an empty distribution")

    # normalize the raw counts into distributions
    p = [count / sum1 for count in dd1]
    q = [count / sum2 for count in dd2]
    mixture = [0.5 * (p_i + q_i) for p_i, q_i in zip(p, q)]

    return 0.5 * relative_entropy(p, mixture) + 0.5 * relative_entropy(q, mixture)


def _load_json(path):
    """Read and parse the JSON file at ``path``, closing it afterwards."""
    with open(path) as f:
        return json.load(f)


def main(argv=None):
    """Compare two DDSketch JSON dumps; return the process exit status."""
    # setup cmdline args
    parser = argparse.ArgumentParser(description="Compares two DDSketch distributions")
    parser.add_argument('--txn1', help='Transaction type for first file', required=True, type=str)
    parser.add_argument('--txn2', help='Transaction type for second file', required=True, type=str)
    parser.add_argument('--file1', help='Path to first ddsketch json', required=True, type=str)
    parser.add_argument('--file2', help='Path to second ddsketch json', required=True, type=str)
    # --op is used as a dictionary key below, so it cannot be omitted.
    parser.add_argument('--op', help='Operation name', required=True, type=str)
    args = parser.parse_args(argv)

    sketch1 = _load_json(args.file1)[args.txn1][args.op]
    sketch2 = _load_json(args.file2)[args.txn2][args.op]

    # Bucket i only means the same value range in both sketches when they
    # were built with the same error guarantee.
    if sketch1["errorGuarantee"] != sketch2["errorGuarantee"]:
        print("ERROR: The sketches have different error guarantees and cannot be compared!",
              file=sys.stderr)
        return 1

    divergence = relative_entropy_symmetric(sketch1["buckets"], sketch2["buckets"])
    print("The similarity is: ", round(divergence, 8))
    print("1 means least alike, 0 means most alike")
    return 0


if __name__ == "__main__":
    sys.exit(main())
import argparse
import ddsketch_calc as dd


# Command line: an optional error guarantee plus a value and/or a bucket
# index to convert. Leaving the guarantee out selects 0.005.
cli = argparse.ArgumentParser(description="Converts values to DDSketch buckets")
cli.add_argument('-e', '--error_guarantee', type=float, default=0.005,
                 help='Error guarantee (default is 0.005)')
cli.add_argument('-v', '--value', type=int, help="Value")
cli.add_argument('-b', '--bucket', type=int, help='Bucket index')
options = cli.parse_args()

converter = dd.DDSketch(options.error_guarantee)

# value -> bucket index
requested_value = options.value
if requested_value is not None:
    print("Bucket index for ", requested_value)
    print(converter.getIndex(requested_value))

# bucket index -> value
requested_bucket = options.bucket
if requested_bucket is not None:
    print("Value for bucket ", requested_bucket)
    print(converter.getValue(requested_bucket))
import argparse
import json
import sys

import matplotlib.pyplot as plt

import ddsketch_calc as dd


def main(argv=None):
    """Plot one DDSketch distribution from a JSON dump; return the exit status."""
    # setup cmdline args
    parser = argparse.ArgumentParser(description="Graphs DDSketch distribution")
    parser.add_argument('-t', '--txn', help='Transaction type (ex: g8ui)', required=True, type=str)
    parser.add_argument('--file', help='Path to ddsketch json', required=True, type=str)
    parser.add_argument('--title', help='Title for the graph', required=False, type=str)
    parser.add_argument('--savefig', help='Will save the plot to a file if set', type=str)
    # --op is used as a dictionary key below, so it cannot be omitted.
    parser.add_argument('--op', help='Which OP to plot (casing matters)', required=True, type=str)
    args = parser.parse_args(argv)

    # Opening JSON file
    with open(args.file) as f:
        data = json.load(f)

    # parse json and init sketch
    # argparse stores '-t/--txn' under the long name, i.e. args.txn (there is
    # no args.t attribute).
    entry = data[args.txn][args.op]
    buckets = entry["buckets"]
    sketch = dd.DDSketch(entry["errorGuarantee"])

    # trim the tails of the distribution
    nonzero = [i for i, count in enumerate(buckets) if count != 0]
    if not nonzero:
        print("ERROR: The sketch contains no samples, nothing to plot!", file=sys.stderr)
        return 1
    first, last = nonzero[0], nonzero[-1]
    counts = buckets[first:last + 1]
    # x-axis: the value each remaining bucket index maps back to
    values = [sketch.getValue(i) for i in range(first, last + 1)]

    # configure the x-axis to make more sense
    _, ax = plt.subplots()
    ax.ticklabel_format(useOffset=False, style='plain')
    plt.plot(values, counts)
    plt.xlabel("Latency (in us)")
    plt.ylabel("Frequency count")

    plt.title(args.title if args.title is not None else "Title")
    plt.xlim([values[0], values[-1]])
    if args.savefig is not None:
        plt.savefig(args.savefig, format='png')
    else:
        plt.show()
    return 0


if __name__ == "__main__":
    sys.exit(main())