8 changes: 4 additions & 4 deletions recipes/mojmelo/recipe.yaml
@@ -1,13 +1,13 @@
context:
version: "0.0.9"
version: "0.1.0"

package:
name: "mojmelo"
version: ${{ version }}

source:
- git: https://github.com/yetalit/mojmelo.git
rev: dd4c87ee4d28d242ce57006182b248c6f95ad37a
rev: 01692d2078e55f4dbeac981240edc620d0dc96af

build:
number: 0
@@ -16,7 +16,7 @@ build:
- mojo package pixi/mojmelo -o ${{ PREFIX }}/lib/mojo/mojmelo.mojopkg
requirements:
host:
- mojo-compiler =0.25.6
- mojo-compiler =0.25.7
run:
- ${{ pin_compatible('mojo-compiler') }}

@@ -27,7 +27,7 @@ tests:
- mojo tests/setup.mojo
requirements:
run:
- mojo-compiler =0.25.6
- mojo-compiler =0.25.7
files:
recipe:
- tests/setup.mojo
43 changes: 32 additions & 11 deletions recipes/mojmelo/tests/mojmelo/utils/Matrix.mojo
@@ -6,36 +6,40 @@ struct Matrix(Copyable, Movable, ImplicitlyCopyable, Sized):
var height: Int
var width: Int
var size: Int
var data: UnsafePointer[Float32]
var data: UnsafePointer[Float32, MutAnyOrigin]
var order: String

# initialize from UnsafePointer
@always_inline
fn __init__(out self, data: UnsafePointer[Float32], height: Int, width: Int, order: String = 'c'):
fn __init__[src: DType = DType.float32](out self, data: UnsafePointer[Scalar[src], MutAnyOrigin], height: Int, width: Int, order: String = 'c'):
self.height = height
self.width = width
self.size = height * width
self.data = data
if src == DType.float32:
self.data = data.bitcast[Float32]()
else:
self.data = cast[src=src, des=DType.float32, width=self.simd_width](data, self.size)
data.free()
self.order = order.lower()

# initialize by copying from UnsafePointer
@always_inline
fn __init__(out self, height: Int, width: Int, data: UnsafePointer[Float32] = UnsafePointer[Float32](), order: String = 'c'):
fn __init__(out self, height: Int, width: Int, data: UnsafePointer[Float32, MutAnyOrigin] = UnsafePointer[Float32, MutAnyOrigin](), order: String = 'c'):
self.height = height
self.width = width
self.size = height * width
self.data = UnsafePointer[Float32].alloc(self.size)
self.data = alloc[Float32](self.size)
self.order = order.lower()
if data:
memcpy(self.data, data, self.size)
memcpy(dest=self.data, src=data, count=self.size)

fn __copyinit__(out self, other: Self):
self.height = other.height
self.width = other.width
self.size = other.size
self.data = UnsafePointer[Float32].alloc(self.size)
self.data = alloc[Float32](self.size)
self.order = other.order
memcpy(self.data, other.data, self.size)
memcpy(dest=self.data, src=other.data, count=self.size)

fn __moveinit__(out self, deinit existing: Self):
self.height = existing.height
@@ -45,7 +49,7 @@ struct Matrix(Copyable, Movable, ImplicitlyCopyable, Sized):
self.order = existing.order
#existing.height = existing.width = existing.size = 0
#existing.order = ''
#existing.data = UnsafePointer[Float32]()
#existing.data = UnsafePointer[Float32, MutAnyOrigin]()

# access an element
@always_inline
@@ -56,7 +60,7 @@ struct Matrix(Copyable, Movable, ImplicitlyCopyable, Sized):
else:
loc = (column * self.height) + row
if loc > self.size - 1 or loc < 0:
raise Error("Error: Location is out of range!")
raise Error("Location is out of range!")
return self.data[loc]

@always_inline
@@ -72,6 +76,24 @@ struct Matrix(Copyable, Movable, ImplicitlyCopyable, Sized):
fn __mul__(self, rhs: Self) raises -> Self:
if self.width != rhs.height:
raise Error('Error: Cannot multiply matrices with shapes (' + String(self.height) + ', ' + String(self.width) + ') and (' + String(rhs.height) + ', ' + String(rhs.width) + ')')

if self.height == 1 and rhs.width == 1:
# Dot product
var mat = Self(1, 1)
mat.data[0] = self.ele_mul(rhs.T()).sum()
return mat^

if self.height * self.width * rhs.width <= 4096:
# matmul naive
var mat = Self(self.height, rhs.width)
for i in range(self.size):
var rhsr = i % self.width
for j in range(rhsr * rhs.width, rhsr * rhs.width + rhs.width):
if rhsr != 0:
mat.data[(Int(i / self.width) * mat.width) + (j % rhs.width)] += self.data[i] * rhs.data[j]
else:
mat.data[(Int(i / self.width) * mat.width) + (j % rhs.width)] = self.data[i] * rhs.data[j]
return mat^
var A = matmul.Matrix[DType.float32](self.data, (self.height, self.width))
var B = matmul.Matrix[DType.float32](rhs.data, (rhs.height, rhs.width))
var C = matmul.Matrix[DType.float32]((self.height, rhs.width))
@@ -91,7 +113,6 @@ struct Matrix(Copyable, Movable, ImplicitlyCopyable, Sized):
return mat^

@staticmethod
@always_inline
fn random(height: Int, width: Int, order: String = 'c') -> Matrix:
random.seed()
var mat = Matrix(height, width, order= order)
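Note on the `__mul__` change above: besides the pointer-origin and allocation updates, the diff adds two fast paths ahead of the blocked kernel: a dot-product case when the result is 1x1, and a naive accumulation when `self.height * self.width * rhs.width <= 4096`. The sketch below restates that flat-index arithmetic in Python for readability; it assumes row-major storage, and the names (`naive_matmul_flat`, `a_flat`, `b_flat`) are illustrative rather than part of the diff.

```python
# Illustrative Python restatement of the small-matrix fast path added to Matrix.__mul__.
# A is m x k, B is k x n, both stored as flat row-major lists (the Mojo code works on
# raw Float32 pointers instead).
def naive_matmul_flat(a_flat, b_flat, m, k, n):
    c_flat = [0.0] * (m * n)
    for i in range(m * k):                         # visit every element of A once
        row_a, col_a = i // k, i % k               # A's column index doubles as B's row index
        for j in range(col_a * n, col_a * n + n):  # flat indices of B's row col_a
            dst = row_a * n + (j % n)
            if col_a != 0:
                c_flat[dst] += a_flat[i] * b_flat[j]
            else:
                # the first contribution assigns, so C needs no separate zeroing pass
                c_flat[dst] = a_flat[i] * b_flat[j]
    return c_flat

# 2x2 example: [[1, 2], [3, 4]] @ [[5, 6], [7, 8]] -> [19.0, 22.0, 43.0, 50.0]
print(naive_matmul_flat([1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], 2, 2, 2))
```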
70 changes: 34 additions & 36 deletions recipes/mojmelo/tests/mojmelo/utils/mojmelo_matmul/matmul.mojo
@@ -3,8 +3,6 @@
from algorithm import vectorize, parallelize
from memory.memory import _malloc, stack_allocation
from sys import CompilationTarget, num_performance_cores, simd_width_of, size_of
import benchmark
from testing import assert_equal
from utils import IndexList
import random
from .params import *
@@ -37,11 +35,11 @@ struct Layout(Copyable, Movable, Writable):
var shape: IndexList[2]
var strides: IndexList[2]

fn __init__(out self, shape: (Int, Int), strides: (Int, Int)):
fn __init__(out self, shape: Tuple[Int, Int], strides: Tuple[Int, Int]):
self.shape = IndexList[2](shape[0], shape[1])
self.strides = IndexList[2](strides[0], strides[1])

fn __init__(out self, shape: (Int, Int)):
fn __init__(out self, shape: Tuple[Int, Int]):
self.strides = IndexList[2](shape[1], 1)
self.shape = IndexList[2](shape[0], shape[1])

@@ -59,31 +57,31 @@ struct Layout(Copyable, Movable, Writable):


struct Matrix[Type: DType]:
var data: UnsafePointer[Scalar[Type]]
var data: UnsafePointer[Scalar[Type], MutAnyOrigin]
var layout: Layout

fn __init__(out self, shape: (Int, Int)):
self.data = UnsafePointer[Scalar[Type]].alloc(shape[0] * shape[1])
fn __init__(out self, shape: Tuple[Int, Int]):
self.data = alloc[Scalar[Type]](shape[0] * shape[1])
self.layout = Layout(shape)

@always_inline("nodebug")
fn __init__(
out self, data: UnsafePointer[Scalar[Type]], var layout: Layout
out self, data: UnsafePointer[Scalar[Type], MutAnyOrigin], var layout: Layout
):
self.data = UnsafePointer[Scalar[Type]](data)
self.data = data
self.layout = layout

@always_inline("nodebug")
fn __init__(
out self, data: UnsafePointer[Scalar[Type]], shape: (Int, Int)
out self, data: UnsafePointer[Scalar[Type], MutAnyOrigin], shape: Tuple[Int, Int]
):
self.data = data
self.layout = Layout(shape)

@always_inline("nodebug")
fn __getitem__(
ref [_]self, i: Int, j: Int
) -> ref [__origin_of(self)] Scalar[Type]:
) -> ref [origin_of(self)] Scalar[Type]:
var offset = self.layout(i, j)
return (self.data + offset)[]

@@ -146,7 +144,7 @@ struct Matrix[Type: DType]:
@always_inline
fn pack_A[
Type: DType, //, mr: Int
](mc: Int, Ac_buffer: UnsafePointer[Scalar[Type]], Ac: Matrix[Type]) -> Matrix[Type]:
](mc: Int, Ac_buffer: UnsafePointer[Scalar[Type], MutAnyOrigin], Ac: Matrix[Type]) -> Matrix[Type]:
@parameter
fn pack_panel(idx: Int):
var i = idx * mr
@@ -184,7 +182,7 @@ fn pack_A[
@always_inline
fn pack_B[
Type: DType, //, kc: Int, nr: Int
](Bc_buffer: UnsafePointer[Scalar[Type]], Bc: Matrix[Type]) -> Matrix[Type]:
](Bc_buffer: UnsafePointer[Scalar[Type], MutAnyOrigin], Bc: Matrix[Type]) -> Matrix[Type]:
var dst_ptr = Bc_buffer
for i in range(0, Bc.shape[1](), nr):
var src_ptr = Bc.data + i
@@ -267,7 +265,7 @@ fn loop_n[

@parameter
fn parallelize_balanced_part(idx: Int):
var Bc_buffer = UnsafePointer[Scalar[Type]](
var Bc_buffer = UnsafePointer[Scalar[Type], MutAnyOrigin](
_malloc[Scalar[Type]](
kc * nc_per_thread * size_of[Type](), alignment=64
)
@@ -290,7 +288,7 @@ fn loop_n[

@parameter
fn parallelize_remainder(idx: Int):
var Bc_buffer = UnsafePointer[Scalar[Type]](
var Bc_buffer = UnsafePointer[Scalar[Type], MutAnyOrigin](
_malloc[Scalar[Type]](
kc * remainder_per_thread * size_of[Type](), alignment=64
)
@@ -348,7 +346,7 @@ fn macro_kernel[
fn micro_kernel[
Type: DType, //, mr: Int, nr: Int, padding: Bool
](mut Cr: Matrix[Type], Ar: Matrix[Type], Br: Matrix[Type]):
alias simd_width = simd_width_of[Type]()
comptime simd_width = simd_width_of[Type]()
constrained[nr % simd_width == 0, "nr must be multiple of simd_width"]()

var Ar_ptr = Ar.data
@@ -440,31 +438,31 @@ fn micro_kernel[

@always_inline
fn matmul_params[Type: DType]() -> IndexList[5]:
alias mc = 8192 // size_of[Type]() # fix this for simplicity
alias N = simd_width_of[Type]()
comptime mc = 8192 // size_of[Type]() # fix this for simplicity
comptime N = simd_width_of[Type]()

alias Vectors = 32 if CompilationTarget.has_avx512f() else 16
comptime Vectors = 32 if CompilationTarget.has_avx512f() else 16

@parameter
fn compute_kc[mr: Int, nr: Int]() -> Int:
alias CBr = Int((L1_ASSOCIATIVITY - 1) / (1 + mr / nr))
comptime CBr = Int((L1_ASSOCIATIVITY - 1) / (1 + mr / nr))
return (CBr * L1_CACHE_SIZE) // (nr * size_of[Type]() * L1_ASSOCIATIVITY)

@parameter
fn compute_params[C: Int]() -> IndexList[5]:
alias p = C // (intsqrt[C]() + 1)
alias mr = C // p - 1
alias nr = p * N
alias CBr = Int((L1_ASSOCIATIVITY - 1) / (1 + mr / nr))
alias kc = compute_kc[mr, nr]()
alias nc = (L2_ASSOCIATIVITY - 1) * L2_CACHE_SIZE // (
comptime p = C // (intsqrt[C]() + 1)
comptime mr = C // p - 1
comptime nr = p * N
comptime CBr = Int((L1_ASSOCIATIVITY - 1) / (1 + mr / nr))
comptime kc = compute_kc[mr, nr]()
comptime nc = (L2_ASSOCIATIVITY - 1) * L2_CACHE_SIZE // (
kc * size_of[Type]() * L2_ASSOCIATIVITY
) - mr
return IndexList[5](mc, nc, kc, mr, nr)

@parameter
if Type.is_floating_point():
alias TempVectors = 1
comptime TempVectors = 1
return compute_params[Vectors - TempVectors]()
else:

@@ -473,25 +471,25 @@ fn matmul_params[Type: DType]() -> IndexList[5]:

@parameter
if CompilationTarget.has_avx512f():
alias TempVectors = 2
comptime TempVectors = 2
return compute_params[Vectors - TempVectors]()
else:
alias TempVectors = 3
comptime TempVectors = 3
return compute_params[Vectors - TempVectors]()
else:
alias TempVectors = 2
comptime TempVectors = 2
return compute_params[Vectors - TempVectors]()


fn matmul[
Type: DType
](m: Int, n: Int, k: Int, mut C: Matrix[Type], A: Matrix[Type], B: Matrix[Type]):
alias params = matmul_params[Type]()
alias mc = params[0]
alias nc = params[1]
alias kc = params[2]
alias mr = params[3]
alias nr = params[4]
comptime params = matmul_params[Type]()
comptime mc = params[0]
comptime nc = params[1]
comptime kc = params[2]
comptime mr = params[3]
comptime nr = params[4]
var resized_mc = roundup(min(mc, m), mr)
var resized_nc = roundup(min(nc, n), nr)
matmul_impl[kc, mr, nr](resized_mc, resized_nc, C, A, B)
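The matmul.mojo changes above are largely a toolchain migration (origin-qualified `UnsafePointer`, `Tuple[Int, Int]` constructor signatures, `alias` renamed to `comptime`, `alloc`, `origin_of`), with the cache-blocking derivation in `matmul_params` left structurally intact. As a reading aid only, here is a Python sketch of that derivation for `float32` on an assumed AVX2 target (SIMD width 8, 16 vector registers, no AVX-512), treating `intsqrt` as an integer square root and reusing the constants from params.mojo; the function name and the quoted numbers are illustrative, not taken from the diff.

```python
# Hypothetical restatement of matmul_params for DType.float32 on an assumed AVX2 target.
from math import isqrt

L1_CACHE_SIZE, L1_ASSOCIATIVITY = 32768, 8       # constants mirrored from params.mojo
L2_CACHE_SIZE, L2_ASSOCIATIVITY = 262144, 4

def matmul_params_f32(simd_width=8, elem_size=4, vectors=16, temp_vectors=1):
    mc = 8192 // elem_size
    c = vectors - temp_vectors                   # SIMD registers left for the micro-kernel tile
    p = c // (isqrt(c) + 1)
    mr = c // p - 1                              # micro-kernel rows
    nr = p * simd_width                          # micro-kernel columns, a multiple of simd_width
    cbr = int((L1_ASSOCIATIVITY - 1) / (1 + mr / nr))
    kc = (cbr * L1_CACHE_SIZE) // (nr * elem_size * L1_ASSOCIATIVITY)
    nc = (L2_ASSOCIATIVITY - 1) * L2_CACHE_SIZE // (kc * elem_size * L2_ASSOCIATIVITY) - mr
    return mc, nc, kc, mr, nr

# Under these assumptions this prints (2048, 188, 256, 4, 24) for (mc, nc, kc, mr, nr).
print(matmul_params_f32())
```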
8 changes: 4 additions & 4 deletions recipes/mojmelo/tests/mojmelo/utils/mojmelo_matmul/params.mojo
@@ -1,4 +1,4 @@
alias L1_CACHE_SIZE = 32768
alias L1_ASSOCIATIVITY = 8
alias L2_CACHE_SIZE = 262144
alias L2_ASSOCIATIVITY = 4
comptime L1_CACHE_SIZE = 32768
comptime L1_ASSOCIATIVITY = 8
comptime L2_CACHE_SIZE = 262144
comptime L2_ASSOCIATIVITY = 4