|
1 | 1 | # LLZK Language Specification {#syntax} |
2 | 2 |
|
3 | | -<!-- |
4 | | -TODO: Update this specification (https://github.com/project-llzk/llzk-lib/issues/262) |
5 | | -Based on: https://www.notion.so/veridise/ZK-IR-Design-5d4e6b675c9142e9b1583bdca3c8c8a6 |
6 | | ---> |
7 | | - |
8 | | -\todo This documentation is out of date and will be updated shortly. |
9 | | - |
10 | 3 | \tableofcontents |
11 | 4 |
|
12 | 5 | ## Syntax |
13 | 6 |
|
14 | | -```EBNF |
15 | | -
|
16 | | -program = program_header, { include | global_const | global_func | component } ; |
17 | | -program_header = "#dialect", identifier ; |
18 | | -include = "#include", ? path ? ; |
19 | | -global_const = identifier, ":=", const_expr, ";" , [ annotation ] ; |
20 | | -global_func = "function.def", identifier, parameters, "->", type, body ; |
21 | | -component = "struct", identifier, [ struct_params ], |
22 | | - "{", fields, funcs, "}", [ annotation ] ; |
23 | | -struct_params = "<", [ identifier, { "," , identifier }], ">" ; |
24 | | -fields = { identifier, ":", type, "," , [ annotation ] } ; |
25 | | -funcs = ( compute , constrain ) | ( constrain, compute ) ; |
26 | | -compute = "function.def compute", parameters, body ; |
27 | | -constrain = "function.def constrain", parameters, body ; |
28 | | -parameters = "(", [ param, { "," , param }], ")" ; |
29 | | -param = identifier, ":", type ; |
30 | | -body = "{", { statement, ";", [ annotation ] }, "}" ; |
31 | | -statement = emit | return | while | if | assign | expr_call ; |
32 | | -emit = "emit", ( expr_contains | ( expr, "=", expr ) ) ; |
33 | | -return = "return", expr ; |
34 | | -while = "while", expr , body ; |
35 | | -if = "if", expr, body, [ "else", body ] ; |
36 | | -assign = lvalue, ":=", expr ; |
37 | | -const_expr = expr ; (* semantics require const type *) |
38 | | -expr = literal | lvalue | expr_array | expr_call | expr_ref | |
39 | | - expr_contains | expr_binop | expr_monop | expr_special ; |
40 | | -expr_array = "[", list_of_expr, "]" ; |
41 | | -expr_call = [ expr_call_base ], expr_call_target ; |
42 | | -expr_call_base = lvalue, [ "<", const_expr, ">" ], "." ; |
43 | | -expr_call_target = identifier, "(", list_of_expr, ")" ; |
44 | | -expr_ref = lvalue, ".", identifier ; |
45 | | -expr_contains = lvalue, "in", lvalue ; |
46 | | -expr_binop = expr, binop, expr ; |
47 | | -list_of_expr = [ expr, { "," , expr }] ; |
48 | | -binop = binop_arith | binop_bit | binop_compare | binop_logic ; |
49 | | -binop_arith = "+" | "-" | "*" | "/" | "%" ; |
50 | | -binop_bit = "&" | "|" | "^" | "<<" | ">>" ; |
51 | | -binop_compare = ">" | "<" | ">=" | "<=" | "==" | "!=" ; |
52 | | -binop_logic = "&&" | "||" ; |
53 | | -expr_monop = ( "(", expr, ")" ) | ( monop, expr ) ; |
54 | | -monop = monop_arith | monop_bit | monop_logic ; |
55 | | -monop_arith = "-" ; |
56 | | -monop_bit = "~" ; |
57 | | -monop_logic = "!" ; |
58 | | -expr_special = "nondetFelt()"; |
59 | | -lvalue = identifier | lvalue_array ; |
60 | | -lvalue_array = lvalue, "[", lvalue, "]" ; |
61 | | -type = { type_modifier }, type_base ; |
62 | | -type_modifier = "pub" | "const" ; |
63 | | -type_base = identifier | ( type, "[", const_expr, "]" ) ; |
64 | | -constant = literal | identifier ; |
65 | | -identifier = letter, { letter | digit } ; (* and symbols if needed *) |
66 | | -annotation = "#", ? anything except newline ? ; |
67 | | -letter = ? all capital and lowercase characters ? ; |
68 | | -literal = ( digit, { digit } ) | ( "0x", hexit, { hexit } ) ; |
69 | | -hexit = digit | "A" | "B" | "C" | "D" | "E" | "F" |
70 | | - | "a" | "b" | "c" | "d" | "e" | "f" ; |
71 | | -digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; |
72 | | -
|
| 7 | +The root `module` in LLZK IR must have the `llzk.lang` attribute with an optional string that is typically used to indicate the source language. The root `module` can contain any number of `struct.def`, `function.def`, or other `module` ops. The `struct.def` op is the foundation of LLZK IR and is used to describe each component in a circuit. It can contain any number of data members, a `compute()` function that holds the witness generation code, and a `constrain()` function that holds the constraint generation code. No other functions may appear within a `struct.def`. |
| 8 | + |
| 9 | +Here is a simple example of LLZK IR translated from the circomlib [and gate](\ref circomlib-and-gate): |
| 10 | + |
| 11 | +```mlir |
| 12 | +module attributes {llzk.lang = "circom"} { |
| 13 | + struct.def @AND { |
| 14 | + struct.member @out : !felt.type {llzk.pub} |
| 15 | + function.def @compute(%a: !felt.type, %b: !felt.type) -> !struct.type<@AND> { |
| 16 | + %self = struct.new : !struct.type<@AND> |
| 17 | + %0 = felt.mul %a, %b : !felt.type, !felt.type |
| 18 | + struct.writem %self[@out] = %0 : !struct.type<@AND>, !felt.type |
| 19 | + function.return %self : !struct.type<@AND> |
| 20 | + } |
| 21 | + function.def @constrain(%self: !struct.type<@AND>, %a: !felt.type, %b: !felt.type) { |
| 22 | + %0 = struct.readm %self[@out] : !struct.type<@AND>, !felt.type |
| 23 | + %1 = felt.mul %a, %b : !felt.type, !felt.type |
| 24 | + constrain.eq %0, %1 : !felt.type, !felt.type |
| 25 | + function.return |
| 26 | + } |
| 27 | + } |
| 28 | +} |
73 | 29 | ``` |
74 | 30 |
|
75 | 31 | ## Types |
76 | 32 |
|
77 | | -- `bool`: subtype of Felt in [0,1] |
78 | | -- `int`: machine integer |
79 | | -- `Felt`: finite field element |
80 | | -- `struct` (component): Aggregate type with named heterogeneous elements. Generally correlates to components/functions in the source language. Constituent elements may be local variables, subcomponents, and/or called functions. |
81 | | -- `array<E>`: elements can be any type, including other array type for multi-dimensional arrays. We may need this type to have a built-in field named `len` that returns the length of the array. |
82 | | -- `const<T>`: **modifier** on types to denote it’s a compile-time constant. Semantic analysis can infer `const` based on usage of literal values, etc. but it can also be specified in the IR in which case the semantic analysis must ensure it’s correct or give an error. The semantics of several syntax nodes require a `const` value, such as the `i` in `GetWeight<i>.compute()`. Global function return type can be `const` and that would allow such a function to be used in these locations. |
| 33 | +- `i1`: (MLIR builtin) Boolean value [0,1]. |
| 34 | +- `index`: (MLIR builtin) Machine integer. |
| 35 | +- `felt.type`: Finite field element. |
| 36 | +- `array.type<N x E>`: Aggregate type with indexed [pseudo-homogeneous](\ref pseudo-homogeneous) elements. Element type cannot be another array type, instead multi-dimensional arrays are specified with a comma-separated list of dimension sizes. Each dimension size can be specified as an integer literal, a symbol (referring to a template parameter within a templated `struct.def`), or an [affine_map](https://mlir.llvm.org/docs/Dialects/Affine/#polyhedral-structures) (used when creating arrays within a loop where the dimension size depends on the loop iteration variable). |
| 37 | +- `struct.type<[..]>`: Aggregate type with named heterogeneous elements corresponding to a `struct.def`. Generally correlates to components/functions in the source language. Constituent elements may be local variables, subcomponents, and/or called functions. Optionally includes a list of parameters to instantiate a templated `struct.def` where each parameter can be an integer literal, a symbol (referring to a template parameter within a templated `struct.def`), a type used to instantiate a `poly.tvar<@N>` (see below), or an [affine_map](https://mlir.llvm.org/docs/Dialects/Affine/#polyhedral-structures) (used when the parameter of a templated `struct.type` depends on a loop iteration variable). |
| 38 | +- `pod.type<..>`: Plain Old Data aggregate type with named heterogeneous elements. Unlike `struct.type`, there is no associated named declaration, the type itself specifies all constituent element types. It can be used more freely than `struct.type` since it has fewer restrictions on modifications. |
| 39 | +- `poly.tvar<@N>`: Placeholder type variable within a templated `struct.def` that may be instantiated with different types. |
| 40 | +- `string.type`: Sequence of characters. |
83 | 41 |
|
84 | | -## Special Constructs |
| 42 | +### Pseudo-homogeneous arrays {#pseudo-homogeneous} |
85 | 43 |
|
86 | | -- `nondetFelt()`: can be used as the parameter of a `constrain()` function when the expression from the source language can be elided because it cannot be used as part of a constraint. For example, expressions containing bitwise operators cannot be part of a constraint. |
87 | | -- A special element can be added to a `struct` to store the return value of a component (i.e., `synthetic_return` in the examples above). |
| 44 | +LLZK supports arrays where the element type is not truly homogeneous, specifically when a templated `struct.type` is used with an `affine_map` parameter. For example, the type `!array.type<10 x !struct.type<@X<[affine_map<(i)[] -> (i*5)>]>>>` contains instances of the struct `@X` instantiated with different parameter values per `affine_map<(i)[] -> (i*5)>`. Use of this type can be seen in [circom_example_2.llzk](\ref test/FrontendLang/Circom/circom_example_2.llzk). If the circuit is ultimately instantiated and flattened, the array will have to be split into scalar values since the instantiated struct type of each element is different. |
88 | 45 |
|
89 | 46 | ## Semantic Rules |
90 | 47 |
|
91 | | -- `emit` must only appear in `constrain()` functions and `return` only in global functions. |
92 | | -- `constrain()` only calls `constrain()` and `compute()` only calls `compute()`. Either can call arbitrary functions but not each other. |
93 | | -- Function parameters are pass-by-value. |
94 | | -- For references like `a.b`, a field named `b` must be present in the component `a`. For references like `c`, if a field named `c` is present in the current component the reference refers to that field instance, otherwise it refers to a local named `c` within that function, with type inferred from the RHS of the expression where `c` is defined. |
95 | | -- Global constants cannot be modified/assigned. |
| 48 | +- Ops marked with the `WitnessGen` trait can only be used in functions with the `allow_witness` attribute (`compute()` within `struct.def` has this by default). Similarly, ops marked with the `ConstraintGen` trait can only be used in functions with the `allow_constraint` attribute (`constrain()` within `struct.def` has this by default). |
| 49 | +- Functions with the `allow_witness` attribute can only call other functions marked with `allow_witness`. Likewise for `allow_constraint`. |
| 50 | +- Ops marked with the `NotFieldNative` trait can only be used in functions with the `allow_non_native_field_ops` attribute. Some of these ops have known transformations to field-native operations but others do not. It is up to backend users to determine how to handle such ops appearing in `constrain()` functions (one possibility being replacing these ops with `llzk.nondet`) |
96 | 51 |
|
97 | 52 | ## Translation Guidelines {#translation-guidelines} |
98 | 53 |
|
99 | | -- The modifier `pub` can be added before the type on `compute()` and `constrain()` parameters to denote which elements of the domain are public (default is private). Likewise, it can be used on struct fields to denote which fields are part of the co-domain (i.e., public outputs). |
100 | | -- The frontend translation for each source language to ZK IR should be as simple as possible since this will be repeated effort for each source language. Any transformations or optimizations on the ZK IR should be done in a shared module run on the internal representation after translation. |
101 | | -- Constant integer parameters on a `struct` can be used to avoid creating multiple versions of that `struct` in the IR. A later pass can flatten the IR if needed by the client analysis. This is even where multiple dialects of the IR could be used, with a flattened dialect disallowing these parameters. |
102 | | -- If loop bounds are known, `scf.for` should be used to make loop bounds explicit. However, `scf.while` is available to handle the general case if that information is not available but this should not be used in the `constrain()` function. |
103 | | -- Global functions (i.e., user-defined or helper functions, located outside of `struct` definitions) are pure. There is no global state and parameters are pass-by-value (i.e., a copy is created) so there is nothing external they can modify. |
104 | | -- Source line information should be handled via MLIR so frontend components must provide that information when building the MLIR AST. |
| 54 | +- The frontend translation for each source language to LLZK IR should be as simple as possible since this will be repeated effort for each source language. To expand support of frontend languages, we welcome proposals of new high-level syntax along with a translation of that syntax to existing LLZK syntax. |
| 55 | +- To promote reusable infrastructure, transformations or optimizations should be performed on the LLZK IR rather than the source language, whenever possible. We welcome PRs to LLZK-lib for reusable passes. |
| 56 | +- Loops can be represented with either `scf.for` or `scf.while` and the optional `llzk.loopbounds` attribute can be added to specify known iteration information. |
| 57 | +- Frontend translations should attach accurate source line information to operations via the `Location` whenever possible. |
105 | 58 | - Only the outermost module should have the `llzk.lang` attribute (because the presence of that attribute is used to determine the “root” symbol table for symbol resolution). |
106 | 59 | - All inner modules must be named because their names are used to build the fully-qualified path names for symbol references. |
107 | 60 | - All references to functions and types must use fully-qualified paths. |
| 61 | + |
| 62 | +[circomlib-and-gate]: https://github.com/iden3/circomlib/blob/master/circuits/gates.circom#L29-L35 |
0 commit comments