Publish kv-cache events (#126)

irar2 · smarunich · commit e2a7c974870a · 2025-08-14T17:40:48.000-04:00
* Publish kv-cache events

Signed-off-by: Ira <IRAR@il.ibm.com>

* Fix lint errors

Signed-off-by: Ira <IRAR@il.ibm.com>

* Review fixes

Signed-off-by: Ira <IRAR@il.ibm.com>

* Sleep to allow previous sub to close

Signed-off-by: Ira <IRAR@il.ibm.com>

---------

Signed-off-by: Ira <IRAR@il.ibm.com>
Signed-off-by: Sergey Marunich <marunich.s@gmail.com>
diff --git a/Makefile b/Makefile
@@ -39,16 +39,14 @@ help: ## Print help
 LDFLAGS ?= -extldflags '-L$(shell pwd)/lib'
 CGO_ENABLED=1
 TOKENIZER_LIB = lib/libtokenizers.a
-# Extract TOKENIZER_VERSION from Dockerfile
-TOKENIZER_VERSION := $(shell grep '^ARG TOKENIZER_VERSION=' Dockerfile | cut -d'=' -f2)
 
 .PHONY: download-tokenizer
 download-tokenizer: $(TOKENIZER_LIB)
 $(TOKENIZER_LIB):
 	## Download the HuggingFace tokenizer bindings.
-	@echo "Downloading HuggingFace tokenizer bindings for version $(TOKENIZER_VERSION)..."
+	@echo "Downloading HuggingFace tokenizer bindings..."
 	mkdir -p lib
-	curl -L https://github.com/daulet/tokenizers/releases/download/$(TOKENIZER_VERSION)/libtokenizers.$(TARGETOS)-$(TARGETARCH).tar.gz | tar -xz -C lib
+	curl -L https://github.com/daulet/tokenizers/releases/download/v1.22.1/libtokenizers.$(TARGETOS)-$(TARGETARCH).tar.gz | tar -xz -C lib
 	ranlib lib/*.a
 
 ##@ Development
diff --git a/pkg/common/config.go b/pkg/common/config.go
@@ -125,6 +125,8 @@ type Configuration struct {
 
 	// ZMQEndpoint is the ZMQ address to publish events, the default value is tcp://localhost:5557
 	ZMQEndpoint string `yaml:"zmq-endpoint"`
+	// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
+	EventBatchSize int `yaml:"event-batch-size"`
 }
 
 type LoraModule struct {
@@ -183,6 +185,7 @@ func newConfig() *Configuration {
 		KVCacheSize:    1024,
 		TokenBlockSize: 16,
 		ZMQEndpoint:    "tcp://localhost:5557",
+		EventBatchSize: 16,
 	}
 }
 
@@ -293,6 +296,9 @@ func (c *Configuration) validate() error {
 	if c.KVCacheSize < 0 {
 		return errors.New("KV cache size cannot be negative")
 	}
+	if c.EventBatchSize < 1 {
+		return errors.New("event batch size cannot less than 1")
+	}
 	return nil
 }
 
@@ -344,6 +350,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.StringVar(&config.TokenizersCacheDir, "tokenizers-cache-dir", config.TokenizersCacheDir, "Directory for caching tokenizers")
 	f.StringVar(&config.HashSeed, "hash-seed", config.HashSeed, "Seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)")
 	f.StringVar(&config.ZMQEndpoint, "zmq-endpoint", config.ZMQEndpoint, "ZMQ address to publish events")
+	f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize, "Maximum number of kv-cache events to be sent together")
 
 	// These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help
 	var dummyString string

Original file line number	Diff line number	Diff line change
`@@ -125,6 +125,8 @@ type Configuration struct {`
`125`	`125`
`126`	`126`	`// ZMQEndpoint is the ZMQ address to publish events, the default value is tcp://localhost:5557`
`127`	`127`	ZMQEndpoint string `yaml:"zmq-endpoint"`
	`128`	`+ // EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16`
	`129`	+ EventBatchSize int `yaml:"event-batch-size"`
`128`	`130`	`}`
`129`	`131`
`130`	`132`	`type LoraModule struct {`
`@@ -183,6 +185,7 @@ func newConfig() *Configuration {`
`183`	`185`	`KVCacheSize: 1024,`
`184`	`186`	`TokenBlockSize: 16,`
`185`	`187`	`ZMQEndpoint: "tcp://localhost:5557",`
	`188`	`+ EventBatchSize: 16,`
`186`	`189`	`}`
`187`	`190`	`}`
`188`	`191`
`@@ -293,6 +296,9 @@ func (c *Configuration) validate() error {`
`293`	`296`	`if c.KVCacheSize < 0 {`
`294`	`297`	`return errors.New("KV cache size cannot be negative")`
`295`	`298`	`}`
	`299`	`+ if c.EventBatchSize < 1 {`
	`300`	`+ return errors.New("event batch size cannot less than 1")`
	`301`	`+ }`
`296`	`302`	`return nil`
`297`	`303`	`}`
`298`	`304`
`@@ -344,6 +350,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {`
`344`	`350`	`f.StringVar(&config.TokenizersCacheDir, "tokenizers-cache-dir", config.TokenizersCacheDir, "Directory for caching tokenizers")`
`345`	`351`	`f.StringVar(&config.HashSeed, "hash-seed", config.HashSeed, "Seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)")`
`346`	`352`	`f.StringVar(&config.ZMQEndpoint, "zmq-endpoint", config.ZMQEndpoint, "ZMQ address to publish events")`
	`353`	`+ f.IntVar(&config.EventBatchSize, "event-batch-size", config.EventBatchSize, "Maximum number of kv-cache events to be sent together")`
`347`	`354`
`348`	`355`	`// These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help`
`349`	`356`	`var dummyString string`