feat: add lora support

mzbac · mzbac · commit 3021ed005c78 · 2024-10-13T23:05:52.000+11:00
diff --git a/Sources/FLUX.swift b/Sources/FLUX.swift
@@ -7,12 +7,47 @@ import Tokenizers
 
 open class FLUX {
 
+  /// Loads LoRA weights from a local path or, failing that, from a Hub repository.
+  ///
+  /// If something exists on disk at `loraPath`, it is used as the weights directory. Otherwise
+  /// `loraPath` is used as a Hub repo id: a snapshot restricted to `*.safetensors` is requested
+  /// and the repo's local location becomes the weights directory.
+  ///
+  /// - Parameters:
+  ///   - hub: Hub client used for the snapshot and for resolving the local repo location.
+  ///   - loraPath: Filesystem path or Hub repo id.
+  ///   - dType: Forwarded unchanged to `loadLoraWeights(directory:dType:)`.
+  /// - Returns: The result of `loadLoraWeights(directory:dType:)` for the resolved directory.
+  internal func loadLoraWeights(hub: HubApi, loraPath: String, dType: DType) async throws
+    -> [String: MLXArray]
+  {
+    // A path that exists locally takes precedence over interpreting the string as a repo id.
+    guard !FileManager.default.fileExists(atPath: loraPath) else {
+      return try Self.loadLoraWeights(directory: URL(fileURLWithPath: loraPath), dType: dType)
+    }
+
+    let repo = Hub.Repo(id: loraPath)
+    try await hub.snapshot(from: repo, matching: ["*.safetensors"])
+    return try Self.loadLoraWeights(directory: hub.localRepoLocation(repo), dType: dType)
+  }
+
+  /// Loads every `.safetensors` file found while enumerating `directory` into one dictionary.
+  ///
+  /// Each key is passed through `remapWeightKey(_:)`. Arrays are converted to `dType`, except
+  /// arrays that are already `.bfloat16`, which are stored unchanged.
+  ///
+  /// - Parameters:
+  ///   - directory: Directory handed to `FileManager`'s enumerator.
+  ///   - dType: Target dtype for arrays that are not `.bfloat16`.
+  /// - Returns: Remapped key to array. If a remapped key occurs more than once, the array
+  ///   assigned last wins.
+  /// - Throws: `CocoaError(.fileReadUnknown)` when no enumerator can be created for
+  ///   `directory`, and any error thrown by `loadArrays(url:)`.
+  internal static func loadLoraWeights(directory: URL, dType: DType) throws -> [String: MLXArray] {
+    // Surface an unreadable directory as an error instead of crashing on a force unwrap.
+    guard
+      let enumerator = FileManager.default.enumerator(
+        at: directory, includingPropertiesForKeys: nil)
+    else {
+      throw CocoaError(.fileReadUnknown, userInfo: [NSFilePathErrorKey: directory.path])
+    }
+
+    var loraWeights = [String: MLXArray]()
+    for case let url as URL in enumerator where url.pathExtension == "safetensors" {
+      for (key, value) in try loadArrays(url: url) {
+        // bfloat16 arrays are kept as they are; everything else is cast to the requested dtype.
+        loraWeights[remapWeightKey(key)] = value.dtype == .bfloat16 ? value : value.asType(dType)
+      }
+    }
+    return loraWeights
+  }
+
   internal static func remapWeightKey(_ key: String) -> String {
-    if (key.contains(".ff.") || key.contains(".ff_context.")) {
+    if key.contains(".ff.") || key.contains(".ff_context.") {
       let components = key.components(separatedBy: ".")
       if components.count >= 5 {
         let blockIndex = components[1]
-        let ffType = components[2] // "ff" or "ff_context"
+        let ffType = components[2]  // "ff" or "ff_context"
         let netIndex = components[4]
 
         if netIndex == "0" {
@@ -89,7 +124,7 @@ open class FLUX {
     {
       t5Weights["relative_attention_bias.weight"] = relativeAttentionBias
     }
-    
+
     t5Encoder.update(parameters: ModuleParameters.unflattened(t5Weights))
     return t5Encoder
   }
@@ -112,8 +147,8 @@ open class FLUX {
 public class Flux1Schnell: FLUX, TextToImageGenerator {
   let clipTokenizer: CLIPTokenizer
   let t5Tokenizer: any Tokenizer
-  let vae: VAE
   let transformer: MultiModalDiffusionTransformer
+  let vae: VAE
   let t5Encoder: T5Encoder
   let clipEncoder: CLIPEncoder
 
@@ -154,7 +189,12 @@ public class Flux1Schnell: FLUX, TextToImageGenerator {
 
   public func generateLatents(parameters: EvaluateParameters) -> DenoiseIterator {
     let latentsShape = [1, (parameters.height / 16) * (parameters.width / 16), 64]
-    let latents = MLXRandom.normal(latentsShape, key: MLXRandom.key(parameters.seed))
+    let latents: MLXArray
+    if let seed = parameters.seed {
+      latents = MLXRandom.normal(latentsShape, key: MLXRandom.key(seed))
+    } else {
+      latents = MLXRandom.normal(latentsShape)
+    }
     let (promptEmbeddings, pooledPromptEmbeddings) = conditionText(prompt: parameters.prompt)
 
     return DenoiseIterator(
@@ -203,8 +243,8 @@ public class Flux1Schnell: FLUX, TextToImageGenerator {
 public class Flux1Dev: FLUX, TextToImageGenerator {
   let clipTokenizer: CLIPTokenizer
   let t5Tokenizer: any Tokenizer
-  let vae: VAE
   let transformer: MultiModalDiffusionTransformer
+  let vae: VAE
   let t5Encoder: T5Encoder
   let clipEncoder: CLIPEncoder
 
@@ -222,7 +262,8 @@ public class Flux1Dev: FLUX, TextToImageGenerator {
   private static func loadTransformer(directory: URL, dType: DType) throws
     -> MultiModalDiffusionTransformer
   {
-    let transformer = MultiModalDiffusionTransformer(MultiModalDiffusionConfiguration(guidanceEmbeds: true))
+    let transformer = MultiModalDiffusionTransformer(
+      MultiModalDiffusionConfiguration(guidanceEmbeds: true))
     var transformerWeights = [String: MLXArray]()
     let enumerator = FileManager.default.enumerator(
       at: directory.appending(path: "transformer"), includingPropertiesForKeys: nil)!
@@ -245,7 +286,12 @@ public class Flux1Dev: FLUX, TextToImageGenerator {
 
   public func generateLatents(parameters: EvaluateParameters) -> DenoiseIterator {
     let latentsShape = [1, (parameters.height / 16) * (parameters.width / 16), 64]
-    let latents = MLXRandom.normal(latentsShape, key: MLXRandom.key(parameters.seed))
+    let latents: MLXArray
+    if let seed = parameters.seed {
+      latents = MLXRandom.normal(latentsShape, key: MLXRandom.key(seed))
+    } else {
+      latents = MLXRandom.normal(latentsShape)
+    }
     let (promptEmbeddings, pooledPromptEmbeddings) = conditionText(prompt: parameters.prompt)
 
     return DenoiseIterator(
@@ -348,4 +394,4 @@ public struct DenoiseIterator: Sequence, IteratorProtocol {
     i += 1
     return latents
   }
-}
+}
diff --git a/Sources/FluxConfiguration.swift b/Sources/FluxConfiguration.swift
@@ -11,13 +11,17 @@ public struct LoadConfiguration: Sendable {
   /// quantize weights
   public var quantize = false
 
+  public let loraPath: String?
   public var dType: DType {
     float16 ? .float16 : .float32
   }
 
-  public init(float16: Bool = true, quantize: Bool = false) {
+  public init(
+    float16: Bool = true, quantize: Bool = false, loraPath: String? = nil
+  ) {
     self.float16 = float16
     self.quantize = quantize
+    self.loraPath = loraPath
   }
 }
 
@@ -26,14 +30,20 @@ public struct EvaluateParameters {
   public var height: Int
   public var numInferenceSteps: Int
   public var guidance: Float
-  public var seed: UInt64
+  public var seed: UInt64?
   public var prompt: String
   public var numTrainSteps: Int
   public let sigmas: MLXArray
 
   public init(
-    numInferenceSteps: Int = 4, width: Int = 1024, height: Int = 1024, guidance: Float = 4.0,
-    seed: UInt64 = 0, prompt: String = "", numTrainSteps: Int = 1000, shiftSigmas: Bool = false
+    width: Int = 512,
+    height: Int = 512,
+    numInferenceSteps: Int = 4,
+    guidance: Float = 4.0,
+    seed: UInt64? = nil,
+    prompt: String = "",
+    numTrainSteps: Int = 1000,
+    shiftSigmas: Bool = false
   ) {
     if width % 16 != 0 || height % 16 != 0 {
       print("Warning: Width and height should be multiples of 16. Rounding down.")
@@ -77,12 +87,36 @@ enum FileKey {
   case tokenizer2
 }
 
+// TODO: add support for mlx flux fine-tuning
+/// Fuses LoRA adapter weights into the weights of the matching `Linear` layers.
+///
+/// For every `Linear` module named `key` in `transform`, looks up
+/// `transformer.<key>.lora_A.weight` and `transformer.<key>.lora_B.weight` in `loraWeight`.
+/// When both exist and `transformerWeight` contains `<key>.weight`, that weight is replaced by
+/// `weight + loraScale * matmul(lora_B, lora_A)`, cast to the dtype of the original weight.
+/// All other entries are returned unchanged.
+///
+/// - Parameters:
+///   - transform: Module whose `namedModules()` are scanned for `Linear` layers.
+///   - transformerWeight: Flattened base weights keyed by parameter path.
+///   - loraWeight: LoRA arrays keyed as described above.
+///   - loraScale: Multiplier applied to the LoRA delta before it is added. Defaults to `1.0`.
+/// - Returns: A copy of `transformerWeight` with the fused weights substituted.
+func fuseLoraWeights(
+  transform: Module, transformerWeight: [String: MLXArray], loraWeight: [String: MLXArray],
+  loraScale: Float = 1.0
+) -> [String: MLXArray] {
+  var fusedWeights = transformerWeight
+
+  for (key, module) in transform.namedModules() where module is Linear {
+    let weightKey = key + ".weight"
+    guard
+      let loraA = loraWeight["transformer." + key + ".lora_A.weight"],
+      let loraB = loraWeight["transformer." + key + ".lora_B.weight"],
+      let baseWeight = fusedWeights[weightKey]
+    else { continue }
+
+    // Cast back to the base weight's dtype so an adapter stored in a different dtype
+    // cannot alter the dtype of the fused parameter.
+    let delta = loraScale * MLX.matmul(loraB, loraA)
+    fusedWeights[weightKey] = (baseWeight + delta).asType(baseWeight.dtype)
+  }
+  return fusedWeights
+}
+
 public struct FluxConfiguration: Sendable {
-  public let id: String
+  public var id: String
   let files: [FileKey: String]
   public let defaultParameters: @Sendable () -> EvaluateParameters
   let factory:
-    @Sendable (HubApi, FluxConfiguration, LoadConfiguration) throws ->
+    @Sendable (HubApi, FluxConfiguration, LoadConfiguration) async throws ->
       FLUX
 
   public func download(
@@ -94,9 +128,9 @@ public struct FluxConfiguration: Sendable {
   }
 
   public func textToImageGenerator(hub: HubApi = HubApi(), configuration: LoadConfiguration)
-    throws -> TextToImageGenerator?
+    async throws -> TextToImageGenerator?
   {
-    try factory(hub, self, configuration) as? TextToImageGenerator
+    try await factory(hub, self, configuration) as? TextToImageGenerator
   }
 
   public static let flux1Schnell = FluxConfiguration(
@@ -113,6 +147,20 @@ public struct FluxConfiguration: Sendable {
     factory: { hub, fluxConfiguration, loadConfiguration in
       let flux = try Flux1Schnell(
         hub: hub, configuration: fluxConfiguration, dType: loadConfiguration.dType)
+
+      if let loraPath = loadConfiguration.loraPath {
+        let loraWeight = try await flux.loadLoraWeights(
+          hub: hub, loraPath: loraPath, dType: loadConfiguration.dType)
+
+        let weights = fuseLoraWeights(
+          transform: flux.transformer,
+          transformerWeight: Dictionary(
+            uniqueKeysWithValues: flux.transformer.parameters().flattened()), loraWeight: loraWeight
+        )
+
+        flux.transformer.update(parameters: ModuleParameters.unflattened(weights))
+      }
+
       if loadConfiguration.quantize {
         quantize(model: flux.clipEncoder, filter: { k, m in m is Linear })
         quantize(model: flux.t5Encoder, filter: { k, m in m is Linear })
@@ -141,6 +189,20 @@ public struct FluxConfiguration: Sendable {
     factory: { hub, fluxConfiguration, loadConfiguration in
       let flux = try Flux1Dev(
         hub: hub, configuration: fluxConfiguration, dType: loadConfiguration.dType)
+
+      if let loraPath = loadConfiguration.loraPath {
+        let loraWeight = try await flux.loadLoraWeights(
+          hub: hub, loraPath: loraPath, dType: loadConfiguration.dType)
+
+        let weights = fuseLoraWeights(
+          transform: flux.transformer,
+          transformerWeight: Dictionary(
+            uniqueKeysWithValues: flux.transformer.parameters().flattened()), loraWeight: loraWeight
+        )
+
+        flux.transformer.update(parameters: ModuleParameters.unflattened(weights))
+      }
+
       if loadConfiguration.quantize {
         quantize(model: flux.clipEncoder, filter: { k, m in m is Linear })
         quantize(model: flux.t5Encoder, filter: { k, m in m is Linear })