Benchmark the production code rather than some arbitrary thing (#5200)

effectfully · michaelpj · web-flow · commit 9ab93f4ca077 · 2023-03-13T11:24:25.000Z
* Benchmark the production code rather than some arbitrary thing

* Apply suggestions from code review

Co-authored-by: Michael Peyton Jones <michael.peyton-jones@iohk.io>

* Address comments

---------

Co-authored-by: Michael Peyton Jones <michael.peyton-jones@iohk.io>
diff --git a/plutus-benchmark/changelog.d/20230310_185153_effectfully_stop_the_irrelevance.rst b/plutus-benchmark/changelog.d/20230310_185153_effectfully_stop_the_irrelevance.rst
@@ -0,0 +1,4 @@
+Fixed
+-----
+
+- Made the `validation` benchmarks use the actual production evaluator (#5200)
diff --git a/plutus-benchmark/plutus-benchmark.cabal b/plutus-benchmark/plutus-benchmark.cabal
@@ -271,6 +271,8 @@ benchmark validation
     , optparse-applicative
     , plutus-benchmark-common
     , plutus-core              ^>=1.3
+    , plutus-ledger-api        ^>=1.3
+    , transformers
 
 ---------------- validation-decode ----------------
 
@@ -291,6 +293,7 @@ benchmark validation-decode
     , plutus-benchmark-common
     , plutus-core              ^>=1.3
     , plutus-ledger-api        ^>=1.3
+    , transformers
 
 ---------------- validation-full ----------------
 
@@ -311,6 +314,7 @@ benchmark validation-full
     , plutus-benchmark-common
     , plutus-core                                                       ^>=1.3
     , plutus-ledger-api:{plutus-ledger-api, plutus-ledger-api-testlib}  ^>=1.3
+    , transformers
 
 ---------------- Cek cost model calibration ----------------
 
diff --git a/plutus-benchmark/validation/BenchCek.hs b/plutus-benchmark/validation/BenchCek.hs
@@ -3,6 +3,7 @@ module Main where
 
 import Common
 import Control.DeepSeq (force)
+import Control.Exception
 import Criterion
 import PlutusBenchmark.Common
 import UntypedPlutusCore as UPLC
@@ -16,11 +17,10 @@ import UntypedPlutusCore as UPLC
      `cabal bench -- plutus-benchmark:validation --benchmark-options crowdfunding`.
 -}
 main :: IO ()
-main = benchWith mkCekBM
+main = evaluate (force getEvalCtx) *> benchWith mkCekBM
  where
    mkCekBM file program =
        -- don't count the undebruijn . unflat cost
         -- `force` to try to ensure that deserialization is not included in benchmarking time.
        let !nterm = force (toNamedDeBruijnTerm $ UPLC._progTerm $ unsafeUnflat file program)
-       in whnf unsafeEvaluateCekNoEmit' nterm
-
+       in whnf evaluateCekLikeInProd nterm
diff --git a/plutus-benchmark/validation/Common.hs b/plutus-benchmark/validation/Common.hs
@@ -3,19 +3,25 @@
 module Common (
     benchWith
     , unsafeUnflat
-    , unsafeEvaluateCekNoEmit'
+    , getEvalCtx
+    , evaluateCekLikeInProd
     , peelDataArguments
     , Term
     ) where
 
+import PlutusPrelude
+
 import PlutusBenchmark.Common (getConfig, getDataDir)
 import PlutusBenchmark.NaturalSort
 
 import PlutusCore qualified as PLC
 import PlutusCore.Builtin qualified as PLC
 import PlutusCore.Data qualified as PLC
 import PlutusCore.Evaluation.Machine.ExBudgetingDefaults qualified as PLC
-import PlutusCore.Evaluation.Machine.Exception
+import PlutusCore.Evaluation.Result
+import PlutusLedgerApi.Common (LedgerPlutusVersion (PlutusV1), evaluateTerm)
+import PlutusLedgerApi.Common.Versions (languageIntroducedIn)
+import PlutusLedgerApi.V3 (EvaluationContext, ParamName, VerboseMode (..), mkEvaluationContext)
 import UntypedPlutusCore qualified as UPLC
 import UntypedPlutusCore.Evaluation.Machine.Cek qualified as UPLC
 
@@ -24,6 +30,8 @@ import Criterion.Main.Options (Mode, parseWith)
 import Criterion.Types (Config (..))
 import Options.Applicative
 
+import Control.Monad.Trans.Except
+import Control.Monad.Trans.Writer.Strict
 import Data.ByteString qualified as BS
 import Data.List (isPrefixOf)
 import Flat
@@ -128,13 +136,32 @@ benchWith act = do
         env (BS.readFile $ dir </> file) $ \scriptBS ->
             bench (dropExtension file) $ act file scriptBS
 
-unsafeEvaluateCekNoEmit' :: UPLC.Term PLC.NamedDeBruijn PLC.DefaultUni PLC.DefaultFun () -> PLC.EvaluationResult  (UPLC.Term PLC.NamedDeBruijn PLC.DefaultUni PLC.DefaultFun ())
-unsafeEvaluateCekNoEmit' =
-       (\(e, _, _) -> unsafeExtractEvaluationResult e) .
-            UPLC.runCekDeBruijn
-                PLC.defaultCekParameters
-                UPLC.restrictingEnormous
-                UPLC.noEmitter
+getEvalCtx
+    :: Either
+            (UPLC.CekEvaluationException UPLC.NamedDeBruijn UPLC.DefaultUni UPLC.DefaultFun)
+            EvaluationContext
+getEvalCtx = do
+    costParams <-
+        maybe
+            (Left evaluationFailure)
+            (Right . take (length $ enumerate @ParamName) . toList)
+            PLC.defaultCostModelParams
+    either (const $ Left evaluationFailure) (Right . fst) . runExcept . runWriterT $
+        mkEvaluationContext costParams
+{-# NOINLINE getEvalCtx #-}
+
+-- | Evaluate a term as it would be evaluated using the on-chain evaluator.
+evaluateCekLikeInProd
+    :: UPLC.Term PLC.NamedDeBruijn PLC.DefaultUni PLC.DefaultFun ()
+    -> Either
+            (UPLC.CekEvaluationException UPLC.NamedDeBruijn UPLC.DefaultUni UPLC.DefaultFun)
+            (UPLC.Term UPLC.NamedDeBruijn UPLC.DefaultUni UPLC.DefaultFun ())
+evaluateCekLikeInProd term = do
+    evalCtx <- getEvalCtx
+    let (getRes, _, _) =
+            -- The validation benchmarks were all created from PlutusV1 scripts
+            evaluateTerm UPLC.restrictingEnormous (languageIntroducedIn PlutusV1) Quiet evalCtx term
+    getRes
 
 type Term = UPLC.Term UPLC.DeBruijn UPLC.DefaultUni UPLC.DefaultFun ()
 
diff --git a/plutus-ledger-api/src/PlutusLedgerApi/Common.hs b/plutus-ledger-api/src/PlutusLedgerApi/Common.hs
@@ -11,6 +11,7 @@ module PlutusLedgerApi.Common
       -- * Script evaluation
     , evaluateScriptCounting
     , evaluateScriptRestricting
+    , evaluateTerm
     , VerboseMode (..)
     , LogOutput
     , EvaluationError (..)
diff --git a/plutus-ledger-api/src/PlutusLedgerApi/Common/Eval.hs b/plutus-ledger-api/src/PlutusLedgerApi/Common/Eval.hs
@@ -13,6 +13,7 @@ module PlutusLedgerApi.Common.Eval
     , VerboseMode (..)
     , evaluateScriptRestricting
     , evaluateScriptCounting
+    , evaluateTerm
     , mkDynEvaluationContext
     , toMachineParameters
     , mkTermToEvaluate
@@ -133,6 +134,28 @@ mkDynEvaluationContext ver newCMP =
 assertWellFormedCostModelParams :: MonadError CostModelApplyError m => Plutus.CostModelParams -> m ()
 assertWellFormedCostModelParams = void . Plutus.applyCostModelParams Plutus.defaultCekCostModel
 
+-- | Evaluate a fully-applied term using the CEK machine. Useful for mimicking the behaviour of the
+-- on-chain evaluator.
+evaluateTerm
+    :: UPLC.ExBudgetMode cost DefaultUni DefaultFun
+    -> ProtocolVersion
+    -> VerboseMode
+    -> EvaluationContext
+    -> UPLC.Term UPLC.NamedDeBruijn DefaultUni DefaultFun ()
+    -> ( Either
+            (UPLC.CekEvaluationException NamedDeBruijn DefaultUni DefaultFun)
+            (UPLC.Term UPLC.NamedDeBruijn DefaultUni DefaultFun ())
+       , cost
+       , [Text]
+       )
+evaluateTerm budgetMode pv verbose ectx =
+    UPLC.runCekDeBruijn
+        (toMachineParameters pv ectx)
+        budgetMode
+        (if verbose == Verbose then UPLC.logEmitter else UPLC.noEmitter)
+-- Just replicating the old behavior, probably doesn't matter.
+{-# INLINE evaluateTerm #-}
+
 {-| Evaluates a script, with a cost model and a budget that restricts how many
 resources it can use according to the cost model. Also returns the budget that
 was actually used.
@@ -155,14 +178,8 @@ evaluateScriptRestricting
     -> (LogOutput, Either EvaluationError ExBudget)
 evaluateScriptRestricting lv pv verbose ectx budget p args = swap $ runWriter @LogOutput $ runExceptT $ do
     appliedTerm <- mkTermToEvaluate lv pv p args
-
     let (res, UPLC.RestrictingSt (ExRestrictingBudget final), logs) =
-            UPLC.runCekDeBruijn
-                (toMachineParameters pv ectx)
-                (UPLC.restricting $ ExRestrictingBudget budget)
-                (if verbose == Verbose then UPLC.logEmitter else UPLC.noEmitter)
-                appliedTerm
-
+            evaluateTerm (UPLC.restricting $ ExRestrictingBudget budget) pv verbose ectx appliedTerm
     tell logs
     liftEither $ first CekError $ void res
     pure (budget `minusExBudget` final)
@@ -184,14 +201,8 @@ evaluateScriptCounting
     -> (LogOutput, Either EvaluationError ExBudget)
 evaluateScriptCounting lv pv verbose ectx p args = swap $ runWriter @LogOutput $ runExceptT $ do
     appliedTerm <- mkTermToEvaluate lv pv p args
-
     let (res, UPLC.CountingSt final, logs) =
-            UPLC.runCekDeBruijn
-                (toMachineParameters pv ectx)
-                UPLC.counting
-                (if verbose == Verbose then UPLC.logEmitter else UPLC.noEmitter)
-                appliedTerm
-
+            evaluateTerm UPLC.counting pv verbose ectx appliedTerm
     tell logs
     liftEither $ first CekError $ void res
     pure final

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +Fixed
 +-----
++
 +- Made the `validation` benchmarks use the actual production evaluator (#5200)