From 058276493fc7e3609b4d9fa65e04ffe7da221b7e Mon Sep 17 00:00:00 2001
From: Kevin Hoxha <kevin.hoxha@snowflake.com>
Date: Mon, 25 Jul 2022 10:29:33 -0700
Subject: [PATCH] ddsketch_utility: Add utility scripts for ddsketch (#7602)

* Adds ddsketch_calc.py which implements a class for DDSketch related calculations
* Adds ddsketch_conversion.py for quickly converting a bucket index to a value or vice versa
* Adds ddsketch_compare.py to compute how similar two ddsketch distributions are
* Adds export_graph.py to graph the ddsketch distribution output by mako

The arguments for ddsketch_conversion.py are:
-b, --bucket: the bucket index that we need to calculate the value from (optional)
-v, --value: the value that we need to calculate the bucket index from (optional)
-e, --error_guarantee: the error guarantee for ddsketch (optional, default is 0.005)

The arguments for ddsketch_compare.py are:
--file1: Path to first ddsketch json
--file2: Path to second ddsketch json
--txn1: The transaction type for the first file
--txn2: The transaction type for the second file
--op: The operation name (ex: GRV, GET ...)

The arguments for export_graph.py:
--file: path to ddsketch distribution
--txn, -t: Transaction type from file
--title: title for graph (optional, otherwise "Title" is used)
--savefig: Path to save the image plot (optional)
--op: Which operation to plot
---
 contrib/ddsketch_calc.py       | 72 ++++++++++++++++++++++++++++++++++
 contrib/ddsketch_compare.py    | 70 +++++++++++++++++++++++++++++++++
 contrib/ddsketch_conversion.py | 45 +++++++++++++++++++++
 contrib/export_graph.py        | 67 +++++++++++++++++++++++++++++++
 4 files changed, 254 insertions(+)
 create mode 100644 contrib/ddsketch_calc.py
 create mode 100644 contrib/ddsketch_compare.py
 create mode 100644 contrib/ddsketch_conversion.py
 create mode 100644 contrib/export_graph.py

diff --git a/contrib/ddsketch_calc.py b/contrib/ddsketch_calc.py
new file mode 100644
index 0000000000..b113cb37dc
--- /dev/null
+++ b/contrib/ddsketch_calc.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+#
+# ddsketch_calc.py
+#
+# This source file is part of the FoundationDB open source project
+#
+# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import math as m
+
+
+# Implements a DDSketch class as described in:
+# https://arxiv.org/pdf/1908.10693.pdf
+
+# This class has methods that use cubic interpolation to quickly compute log
+# and inverse log. The coefficients A,B,C as well as correctingFactor are
+# all constants used for interpolating. 
+
+# The implementation for interpolation was originally seen here in:
+# https://github.com/DataDog/sketches-java/
+# in the file CubicallyInterpolatedMapping.java
+
+class DDSketch(object):  # value <-> bucket-index mapping built on a cubic-interpolated log
+    A = 6.0 / 35.0  # coefficient of s^3 in the polynomial evaluated by fastlog
+    B = -3.0 / 5.0  # coefficient of s^2
+    C = 10.0 / 7.0  # coefficient of s
+    EPS = 1e-18  # __init__ sets offset to the bucket index of 1/EPS
+    correctingFactor = 1.00988652862227438516  # scales log(2)/log(gamma) to form multiplier
+    offset = 0  # class default; still 0 while __init__ calls getIndex to compute the real offset
+    multiplier = 0  # overwritten per instance in __init__
+    gamma = 0  # overwritten per instance in __init__
+
+    def __init__(self, errorGuarantee):  # errorGuarantee must not be 1 (division by 1 - errorGuarantee)
+        self.gamma = (1 + errorGuarantee) / (1 - errorGuarantee)
+        self.multiplier = (self.correctingFactor * m.log(2)) / m.log(self.gamma)
+        self.offset = self.getIndex(1.0 / self.EPS)  # getIndex reads self.offset == 0 (class default) here
+
+    def fastlog(self, value):  # interpolated logarithm: cubic in the significand plus the binary exponent
+        s = np.frexp(value)  # (mantissa, exponent) with value == mantissa * 2**exponent
+        e = s[1]
+        s = s[0]
+        s = s * 2 - 1  # for positive value the frexp mantissa is in [0.5, 1), so s lands in [0, 1)
+        return ((self.A * s + self.B) * s + self.C) * s + e - 1  # Horner form of A*s^3 + B*s^2 + C*s, plus (e - 1)
+
+    def reverseLog(self, index):  # inverse of fastlog: closed-form root of the same cubic
+        exponent = m.floor(index)  # integer part; (index - exponent) below is the fractional part
+        d0 = self.B * self.B - 3 * self.A * self.C  # d0, d1: intermediate terms of the closed-form cubic root
+        d1 = 2 * self.B * self.B * self.B - 9 * self.A * self.B * self.C - 27 * self.A * self.A * (index - exponent)
+        p = np.cbrt((d1 - np.sqrt(d1 * d1 - 4 * d0 * d0 * d0)) / 2)
+        significandPlusOne = - (self.B + p + d0 / p) / (3 * self.A) + 1  # root s of the cubic, shifted by 1
+        return np.ldexp(significandPlusOne / 2, exponent + 1)  # ldexp(x, k) == x * 2**k, i.e. significandPlusOne * 2**exponent
+
+    def getIndex(self, sample):  # bucket index: ceil of the scaled fastlog, shifted by offset
+        return m.ceil(self.fastlog(sample) * self.multiplier) + self.offset
+
+    def getValue(self, idx):  # undoes getIndex's shift and scale, then multiplies by 2 / (1 + gamma)
+        return self.reverseLog((idx - self.offset) / self.multiplier) * 2.0 / (1 + self.gamma)
+
diff --git a/contrib/ddsketch_compare.py b/contrib/ddsketch_compare.py
new file mode 100644
index 0000000000..d3b5f9942f
--- /dev/null
+++ b/contrib/ddsketch_compare.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+#
+# ddsketch_compare.py
+#
+# This source file is part of the FoundationDB open source project
+#
+# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import json
+import numpy as np
+
+
+# Kullback-Leibler divergence (relative entropy) of p with respect to q, using log base 2
+def relative_entropy(p, q):  # q is indexed with the positions of p, so it needs at least len(p) entries
+    difference = 0.0
+    for i in range(len(p)):
+        if p[i] != 0.0 and q[i] != 0.0:  # terms where either entry is zero are skipped
+            difference += (p[i] * np.log2(p[i]/q[i]))
+    return difference
+
+# Jensen-Shannon divergence (symmetric relative entropy) of two lists of bucket counts
+def relative_entropy_symmetric(dd1, dd2):  # returns 0.5 * KL(p, m) + 0.5 * KL(q, m)
+    # normalize dd1, dd2 into distributions p, q that each sum to 1
+    sum1 = sum(dd1)  # a zero total makes the division below fail
+    sum2 = sum(dd2)
+
+    p = [dd1[i] / sum1 for i in range(len(dd1))]
+    q = [dd2[i] / sum2 for i in range(len(dd2))]
+    m = [0.5 * (p[i] + q[i]) for i in range(len(p))]  # element-wise midpoint; q is indexed with the positions of p
+
+    return 0.5 * relative_entropy(p, m) + 0.5 * relative_entropy(q, m)
+
+# setup cmdline args
+parser = argparse.ArgumentParser(description="Compares two DDSketch distributions")
+parser.add_argument('--txn1', help='Transaction type for first file', required=True, type=str)
+parser.add_argument('--txn2', help='Transaction type for second file', required=True, type=str)
+parser.add_argument('--file1', help='Path to first ddsketch json', required=True, type=str)
+parser.add_argument('--file2', help='Path to second ddsketch json', required=True, type=str)
+parser.add_argument("--op", help='Operation name', required=True, type=str)
+args = parser.parse_args()
+
+# load both sketches; the context managers close the files once they are parsed
+with open(args.file1) as f1, open(args.file2) as f2:
+    data1 = json.load(f1)
+    data2 = json.load(f2)
+
+if data1[args.txn1][args.op]["errorGuarantee"] != data2[args.txn2][args.op]["errorGuarantee"]:
+    # SystemExit with a message prints it to stderr and exits with a non-zero status
+    raise SystemExit("ERROR: The sketches have different error guarantees and cannot be compared!")
+
+b1 = data1[args.txn1][args.op]["buckets"]
+b2 = data2[args.txn2][args.op]["buckets"]
+
+divergence = relative_entropy_symmetric(b1, b2)
+print("The similarity is: ", round(divergence, 8))
+print("1 means least alike, 0 means most alike")
\ No newline at end of file
diff --git a/contrib/ddsketch_conversion.py b/contrib/ddsketch_conversion.py
new file mode 100644
index 0000000000..5b9825f267
--- /dev/null
+++ b/contrib/ddsketch_conversion.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+#
+# ddsketch_conversion.py
+#
+# This source file is part of the FoundationDB open source project
+#
+# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import ddsketch_calc as dd
+
+
+parser = argparse.ArgumentParser(description="Converts values to DDSketch buckets")  # every flag below is optional
+parser.add_argument('-e', '--error_guarantee', help='Error guarantee (default is 0.005)', required=False, type=float)
+parser.add_argument('-v', '--value', help="Value", required=False, type=int)  # value is parsed as an int
+parser.add_argument('-b', '--bucket', help='Bucket index', required=False, type=int)
+args = parser.parse_args()  # with neither -v nor -b the script prints nothing
+
+error = 0.005  # default error guarantee, used when -e is not given
+
+if args.error_guarantee is not None:
+    error = args.error_guarantee
+
+sketch = dd.DDSketch(error)
+
+if args.value is not None:  # -v: print the bucket index computed for the value
+    print("Bucket index for ", args.value)
+    print(sketch.getIndex(args.value))
+
+if args.bucket is not None:  # -b: print the value computed for the bucket index
+    print("Value for bucket ", args.bucket)
+    print(sketch.getValue(args.bucket))
\ No newline at end of file
diff --git a/contrib/export_graph.py b/contrib/export_graph.py
new file mode 100644
index 0000000000..9cd369f7de
--- /dev/null
+++ b/contrib/export_graph.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+#
+# export_graph.py
+#
+# This source file is part of the FoundationDB open source project
+#
+# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import json
+import matplotlib.pyplot as plt
+import argparse
+import ddsketch_calc as dd
+
+# setup cmdline args
+parser = argparse.ArgumentParser(description="Graphs DDSketch distribution")
+parser.add_argument('-t', '--txn', help='Transaction type (ex: g8ui)', required=True, type=str)
+parser.add_argument('--file', help='Path to ddsketch json', required=True, type=str)
+parser.add_argument('--title', help='Title for the graph', required=False, type=str)
+parser.add_argument('--savefig', help='Will save the plot to a file if set', type=str)
+parser.add_argument('--op', help='Which OP to plot (casing matters)', required=True, type=str)
+args = parser.parse_args()
+
+
+# Opening JSON file; the context manager closes it once it is parsed
+with open(args.file) as f:
+    data = json.load(f)
+
+# parse json and init sketch
+# NOTE: argparse stores '-t/--txn' under args.txn (dest comes from the long option); args.t does not exist
+op_data = data[args.txn][args.op]
+buckets = op_data["buckets"]
+sketch = dd.DDSketch(op_data["errorGuarantee"])
+
+# trim the tails of the distribution
+ls = [i for i, e in enumerate(buckets) if e != 0]
+if not ls:
+    raise SystemExit("ERROR: all buckets are empty, nothing to plot")
+actual_data = buckets[ls[0]:ls[-1]+1]
+indices = range(ls[0], ls[-1]+1)
+actual_indices = [sketch.getValue(i) for i in indices]
+
+# configure the x-axis to make more sense
+fig, ax = plt.subplots()
+ax.ticklabel_format(useOffset=False, style='plain')
+plt.plot(actual_indices, actual_data)
+plt.xlabel("Latency (in us)")
+plt.ylabel("Frequency count")
+
+plt.title(args.title if args.title is not None else "Title")
+plt.xlim([actual_indices[0], actual_indices[-1]])
+if args.savefig is not None:
+    plt.savefig(args.savefig, format='png')
+else:
+    plt.show()
\ No newline at end of file