From a1515aa47fc4bc4db84e125aa2a69954932a8a1d Mon Sep 17 00:00:00 2001
From: AlistairStewart <32751032+AlistairStewart@users.noreply.github.com>
Date: Fri, 21 Jul 2023 14:59:51 +0200
Subject: [PATCH] prettified randao analysis

---
 random/randao_analysis.ipynb | 177 ++++++++++++++---------------------
 1 file changed, 71 insertions(+), 106 deletions(-)

diff --git a/random/randao_analysis.ipynb b/random/randao_analysis.ipynb
index 515b33e..3f8a9e5 100644
--- a/random/randao_analysis.ipynb
+++ b/random/randao_analysis.ipynb
@@ -1,14 +1,37 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Randao Analysis using Markov Chains\n",
+    "\n",
+    "This Sagemath Jupyter notebook is for the analysis of Randao randomness, under the assumption that honest block producers always produce blocks in their slot that always get into the chain and can never be reverted. This assumption is not necessarily reasonable.\n",
+    "\n",
+    "The randomness is sampled at a particular slot, e.g. the end of an epoch or just before it is used by a smart contract. The adversary cannot predict the randomness at the last slot up to this slot that has an honest block producer, because their contribution is random and unknown. However, each adversarial block producer between this last honest slot and the sampled slot has a choice of whether to produce a block or not. Thus if the adversary controls m slots in a row up to the sampling slot, then they have 2^m choices for the randomness.\n",
+    "\n",
+    "The randomness sampled at the end of epoch n is used to determine the block producers in epoch n+2. We can imagine that the adversary wants to maximise the number of adversarial slots at the end of epoch n+2 to get control over the randomness in epoch n+4 etc. If they choose to do this, we can construct a Markov chain, where each state is the number of adversarial blocks at the end of this epoch. The next state is the number of adversarial blocks at the end of the epoch after next. (So odd and even numbered epochs are mostly independent.)\n",
+    "\n",
+    "If the adversary controls 1/3 of the validator set and controls m slots at the end of the current epoch, then under this attack the distribution of the number of slots they control at the end of the epoch after next is that of the maximum of 2^m independent geometric random variables (the kind that start at 0) with parameter 2/3.\n",
+    "\n",
+    "The stationary distribution of this Markov chain is the distribution of the number of adversarial slots at the end of the epoch under continuous attack where the adversary tries to maximise this.\n",
+    "\n",
+    "We want to consider sampling randomness 4 epochs after some trigger happens. Now an attacker could wait until the current epoch has many adversarial validators at the end, before causing the trigger to happen. Then, 4 epochs later, the current epoch may still be somewhat biasable, allowing the adversary to have a higher-than-usual chance to get many adversarial blocks before the trigger block.\n",
+    "\n",
+    "To analyse this, we first need a conservative estimate of how many adversarial slots at the end of the current epoch the adversary can feasibly wait for, under the continuous attack or not. Then we can consider two transitions of the Markov chain from this event as being the distribution of the number of adversarial slots before the randomness is sampled. Now we can compute the expected number of options for the sampled randomness from this distribution."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
+    "# The cumulative distribution function and probability mass function\n",
+    "# at m for the maximum of t geometric distributions with parameter p\n",
     "def cdfmaxgeo(p,t,m):\n",
     "    return (1-(1-p)^(m+1))^t\n",
-    "def pdfmaxgeo(p,t,m):\n",
+    "def pmfmaxgeo(p,t,m):\n",
     "    return cdfmaxgeo(p,t,m)-cdfmaxgeo(p,t,m-1)\n",
     "    "
    ]
@@ -19,25 +42,21 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def nextstateprob(x,y):\n",
-    "    if x ==63:\n",
-    "        return 1-cdfmaxgeo(2/3+0.0,2^x,62)\n",
-    "    return pdfmaxgeo(2/3+0.0,2^x,y)"
+    "# Now we build the transition matrix.\n",
+    "# Each column is the maximum of 2^j geometric distributions with parameter 2/3.\n",
+    "# For the last row, corresponding to m=63, \n",
+    "# we take probability of being at least 63 so the probabilities add up to 1.\n",
+    "def nextstateprob(j,i):\n",
+    "    if i ==63:\n",
+    "        return 1-cdfmaxgeo(2/3+0.0,2^j,62)\n",
+    "    return pmfmaxgeo(2/3+0.0,2^j,i)\n",
+    "tm=matrix([[nextstateprob(j,i) for j in range(64)] for i in range(64)])"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "tm=matrix([[nextstateprob(x,y) for x in range(64)] for y in range(64)])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 80,
-   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -52,43 +71,27 @@
        "(0.476563095838547, 0.290008277069172, 0.139917968194352, 0.0586770548078090, 0.0224471788438172, 0.00810421240432098, 0.00282576354047732, 0.000965620671507453, 0.000326252792938701, 0.000109544499335094, 0.0000366568819313766, 0.0000122441918654699, 4.08585723181877e-6, 1.36273835423470e-6, 4.54384351862505e-7, 1.51485734113910e-7, 5.04995072826173e-8, 1.68339168055658e-8, 5.61143674448838e-9, 1.87050189995125e-9, 6.23504803604958e-10, 2.07835570232040e-10, 6.92786571866131e-11, 2.30930253742276e-11, 7.69755676003532e-12, 2.56585251964193e-12, 8.55525570183441e-13, 2.84933826330784e-13, 9.52193095509939e-14, 3.14984028202337e-14, 1.04994676112577e-14, 3.62050607336612e-15, 1.08615182206194e-15, 3.62050607359248e-16, 3.62050607361851e-16, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000)"
       ]
      },
-     "execution_count": 80,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# Next we use the power method to approximate the stationary distribution stat\n",
     "pm=tm\n",
     "for _ in range(20):\n",
     " pm = pm^2\n",
     "stat=pm*vector({1:1,63:0})\n",
+    "# The probabilities should add up to 1. \n",
+    "# Too many iterations of the power method will blow up the rounding error, \n",
+    "# so it's better to check that it is not too far from 1.\n",
     "print(sum(stat))\n",
     "stat"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 81,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1.51485734113910e-7"
-      ]
-     },
-     "execution_count": 81,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "stat[15]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -97,59 +100,38 @@
        "(0.666666666666667, 0.222222222222222, 0.0740740740740741, 0.0246913580246914, 0.00823045267489719, 0.00274348422496573, 0.000914494741655170, 0.000304831580551723, 0.000101610526850648, 0.0000338701756168458, 0.0000112900585389486, 3.76335284635321e-6, 1.25445094878440e-6, 4.18150316261467e-7, 1.39383438679808e-7, 4.64611462636100e-8, 1.54870487545367e-8, 5.16234954783812e-9, 1.72078318261271e-9, 5.73594394204235e-10, 1.91198168408846e-10, 6.37326857955145e-11, 2.12442285985048e-11, 7.08144654026910e-12, 2.36044517265555e-12, 7.86815057551848e-13, 2.62345700718924e-13, 8.73745520379998e-14, 2.91988655476416e-14, 9.65894031423886e-15, 3.21964677141295e-15, 1.11022302462516e-15, 3.33066907387547e-16, 1.11022302462516e-16, 1.11022302462516e-16, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000, 0.000000000000000)"
       ]
      },
-     "execution_count": 47,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# unbiased is the distribution after one transition, which should be just geometric with parameter 2/3\n",
     "unbiased=tm*vector({0:1,63:0})\n",
     "unbiased"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "4.64611462636100e-8"
-      ]
-     },
-     "execution_count": 48,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "4.64611462636100e-8 4.64611462636100e-8 1.51485734113910e-7\n"
+     ]
     }
    ],
    "source": [
-    "pdfmaxgeo(2/3+0.0,1,15)"
+    "# Just some sanity checking\n",
+    "print(pmfmaxgeo(2/3+0.0,1,15),unbiased[15],stat[15])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "4.64611462636100e-8"
-      ]
-     },
-     "execution_count": 49,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "unbiased[15]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -162,6 +144,7 @@
     }
    ],
    "source": [
+    "# Next we look at the expected number of options the adversary has under these distributions\n",
     "def expectedoptions(dist):\n",
     "    return sum([2^i*x for (i,x) in zip(range(64),dist)])\n",
     "def expectedtail(dist):\n",
@@ -172,7 +155,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -181,19 +164,20 @@
        "1.21765601217656e-8"
       ]
      },
-     "execution_count": 22,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
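+    "# The number of epochs in a century. NOTE(review): assuming 5 slots per minute and 32 slots per epoch, 5*60*24*365/32 is epochs per year, so the factor 1000 gives epochs per 1000 years (a century would use 100) - confirm\n",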
     "epochsinacentury=(5*60*24*365*1000/32)\n",
     "1/epochsinacentury+0.0"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -210,6 +194,9 @@
     }
    ],
    "source": [
+    "# So what tail length can feasibly occur if the adversary waits long enough, \n",
+    "# both when they are trying to maximise the tail length, which gives the distribution stat\n",
+    "# and when they just wait (i.e. using unbiased)\n",
     "print(\"tail length 15 occurs every\",100/(stat[15]*epochsinacentury),\"years in expectation under the stationary distribution\")\n",
     "print(\"tail length 15 occurs every\",100/(unbiased[15]*epochsinacentury),\"years in expectation under the single transition from unbiased distribution\")\n",
     "print(\"tail length 16 occurs every\",100/(stat[16]*epochsinacentury),\"years in expectation under the stationary distribution\")\n",
@@ -220,7 +207,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -229,18 +216,20 @@
        "172.837461872421"
       ]
      },
-     "execution_count": 85,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# If we do two transitions from 16\n",
+    "# corresponding to taking a sample 4 epochs after the epoch that the adversary timed an attack for\n",
     "expectedoptions(tm*tm*vector({16:1,63:0}))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 86,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -249,18 +238,19 @@
        "6.41125360736917"
       ]
      },
-     "execution_count": 86,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# and how long is the expected tail then?\n",
     "expectedtail(tm*tm*vector({16:1,63:0}))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -269,18 +259,19 @@
        "1901.01954391692"
       ]
      },
-     "execution_count": 65,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# What about if we wait 2 epochs instead of 4?\n",
     "expectedoptions(tm*vector({16:1,63:0}))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -289,41 +280,15 @@
        "36.3762992608834"
       ]
      },
-     "execution_count": 87,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# Or 6 epochs?\n",
     "expectedoptions(tm*tm*tm*vector({16:1,63:0}))"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 46,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 46,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "tm.transpose()[0]==tm*vector({0:1,63:0})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {