Skip to content

Commit e8250bc

Browse files
authored
Merge pull request #4 from Node0/main
Added true tokenizer stats
2 parents 405b153 + e2fcbc9 commit e8250bc

File tree

3 files changed

+237
-31
lines changed

3 files changed

+237
-31
lines changed
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import json
5+
import http.client
6+
import sys
7+
8+
def list_models(hostname, port):
    """List available models from the Ollama server."""
    conn = http.client.HTTPConnection(hostname, port)
    try:
        # GET /api/tags is the endpoint that enumerates locally installed models.
        conn.request("GET", "/api/tags")
        resp = conn.getresponse()

        if resp.status != 200:
            print(f"Error: HTTP {resp.status} - {resp.reason}")
            return

        catalog = json.loads(resp.read().decode("utf-8"))

        print("Available Models:\n")
        for entry in catalog.get("models", []):
            meta = entry.get("details", {})
            # One human-readable record per model.
            print(f"Name: {entry.get('name', 'Unknown')}")
            print(f" Size: {entry.get('size', 0) / 1_000_000_000:.2f} GB")  # bytes -> GB
            print(f" Modified At: {entry.get('modified_at', 'Unknown')}")
            print(f" Parameters: {meta.get('parameter_size', 'Unknown')}")
            print(f" Quantization Level: {meta.get('quantization_level', 'Unknown')}")
            print(f" Format: {meta.get('format', 'Unknown')}")
            print("-" * 40)
    finally:
        conn.close()
44+
45+
46+
def stream_ollama_response(hostname, port, model, prompt, system_prompt, temperature, max_tokens):
    """Stream responses from the Ollama server.

    Sends a generate request and prints each streamed token as it arrives,
    then prints the fully concatenated response.

    Args:
        hostname: Hostname of the Ollama server.
        port: Port number of the Ollama server.
        model: Name of the model to generate with.
        prompt: User prompt text.
        system_prompt: System prompt defining the model's behavior.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.
    """
    endpoint = "/api/generate"
    headers = {"Content-Type": "application/json"}
    # FIX: Ollama's /api/generate ignores unknown top-level keys, so the
    # original top-level "temperature"/"max_tokens" were silently dropped.
    # Sampling parameters must be nested under "options", where the token
    # cap is named "num_predict".
    payload = json.dumps({
        "model": model,
        "prompt": prompt,
        "system": system_prompt,
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens
        }
    })

    connection = http.client.HTTPConnection(hostname, port)
    try:
        connection.request("POST", endpoint, body=payload, headers=headers)
        response = connection.getresponse()

        if response.status != 200:
            print(f"Error: HTTP {response.status} - {response.reason}")
            return

        concatenated_response = ""
        # The server streams one JSON object per line; each carries a
        # "response" text fragment until the final object with "done": true.
        for line in response:
            line = line.decode("utf-8").strip()
            if not line:
                continue
            try:
                response_data = json.loads(line)
            except json.JSONDecodeError:
                print(f"\nInvalid JSON: {line}")
                continue
            word = response_data.get("response", "")
            concatenated_response += word
            print(word, end="", flush=True)
            if response_data.get("done"):
                # Final chunk carries timing stats, no further text.
                break

        print("\n\nFinal Response:")
        print(concatenated_response)

    finally:
        connection.close()
84+
85+
def main():
    """Parse CLI arguments and dispatch to model listing or streaming generation.

    Exits with status 1 (message on stderr) when --prompt or --model is
    missing in generation mode.
    """
    parser = argparse.ArgumentParser(description="Interact with Ollama server.")
    parser.add_argument("--ollama-hostname", required=True, help="Hostname of the Ollama server")
    parser.add_argument("--ollama-port", type=int, required=True, help="Port number of the Ollama server")

    # Mutually exclusive arguments: either list models or pick one to run.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--list-models", action="store_true",
                       help="List all available models on the Ollama server")
    group.add_argument("--model", help="Specify the model to use")

    # Other arguments
    parser.add_argument("--prompt", help="Prompt to send to the Ollama server")
    parser.add_argument("--system-prompt", default="You are a helpful assistant.",
                        help="System prompt to define the model's behavior")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Temperature for response randomness")
    parser.add_argument("--max-tokens", type=int, default=100,
                        help="Maximum number of tokens to generate")

    args = parser.parse_args()

    # If --list-models is specified, list models and exit
    if args.list_models:
        list_models(args.ollama_hostname, args.ollama_port)
        return

    # Validate other required arguments.
    # FIX: diagnostics go to stderr so piped stdout stays clean
    # (the original printed these errors to stdout).
    if not args.prompt:
        print("Error: --prompt is required when --list-models is not specified.", file=sys.stderr)
        sys.exit(1)

    if not args.model:
        print("Error: --model is required when --list-models is not specified.", file=sys.stderr)
        sys.exit(1)

    # Stream response from the server
    stream_ollama_response(
        args.ollama_hostname,
        args.ollama_port,
        args.model,
        args.prompt,
        args.system_prompt,
        args.temperature,
        args.max_tokens
    )
131+
132+
# Script entry point: run the CLI only when executed directly,
# not when this file is imported as a module.
if __name__ == "__main__":
    main()
134+

python-ideas/tkps_vs_gpu_count_opimizer.py renamed to python-tools/tkps_vs_gpu_count_opimizer.py

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,24 @@
11
#!/usr/bin/env python3
22

3+
# This program is free software: you can redistribute it and/or modify
4+
# it under the terms of the GNU General Public License as published by
5+
# the Free Software Foundation, either version 3 of the License, or
6+
# (at your option) any later version.
7+
#
8+
# This program is distributed in the hope that it will be useful,
9+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11+
# GNU General Public License for more details.
12+
#
13+
# You should have received a copy of the GNU General Public License
14+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
15+
#
16+
# Copyright (C) 2025 Joe Hacobian
17+
18+
319
import argparse
420

521
# Constants
6-
CONTINENTS = 7
722
WORDS_PER_ARTICLE = 2500
823
TOKENS_PER_WORD = 1.88
924
SECONDS_PER_HOUR = 3600
@@ -13,13 +28,13 @@
1328
debug = False
1429

1530
# Function to calculate processing time
16-
def calculate_time(tkps, gpu_count, articles_per_continent):
31+
def calculate_time(tkps, gpu_count, articles_per_continent, continents):
1732
if tkps <= 0 or gpu_count <= 0:
1833
if debug:
1934
print(f"[DEBUG] Invalid tkps ({tkps}) or gpu_count ({gpu_count}), returning infinity.")
2035
return float('inf') # Avoid division by zero
2136

22-
total_tokens = articles_per_continent * CONTINENTS * WORDS_PER_ARTICLE * TOKENS_PER_WORD
37+
total_tokens = articles_per_continent * continents * WORDS_PER_ARTICLE * TOKENS_PER_WORD
2338
tokens_processed_per_second = tkps * gpu_count
2439
tokens_processed_per_hour = tokens_processed_per_second * SECONDS_PER_HOUR
2540
time_hours = total_tokens / tokens_processed_per_hour
@@ -30,17 +45,17 @@ def calculate_time(tkps, gpu_count, articles_per_continent):
3045
return time_minutes
3146

3247
# Optimized function to find the optimal configuration
33-
def find_optimal_configuration(target_time, max_tkps, max_gpus, articles_per_continent):
48+
def find_optimal_configuration(target_time, max_tkps, max_gpus, articles_per_continent, continents):
3449
best_configuration = None
3550
closest_time = float('inf')
3651

3752
if debug:
38-
print(f"[DEBUG] Starting optimization: target_time={target_time}, max_tkps={max_tkps}, max_gpus={max_gpus}, articles_per_continent={articles_per_continent}")
53+
print(f"[DEBUG] Starting optimization: target_time={target_time}, max_tkps={max_tkps}, max_gpus={max_gpus}, articles_per_continent={articles_per_continent}, continents={continents}")
3954

4055
# Start by maximizing tkps per GPU and minimizing GPU count
4156
for tkps in range(max_tkps, 0, -1): # Start with the highest feasible tkps
4257
for gpu_count in range(1, max_gpus + 1): # Start with one GPU
43-
current_time = calculate_time(tkps, gpu_count, articles_per_continent)
58+
current_time = calculate_time(tkps, gpu_count, articles_per_continent, continents)
4459

4560
if debug:
4661
print(f"[DEBUG] Checking configuration -> tkps: {tkps}, gpu_count: {gpu_count}, current_time: {current_time:.2f}")
@@ -91,6 +106,12 @@ def main():
91106
default=70,
92107
help="Maximum number of GPUs available (default: 70)."
93108
)
109+
parser.add_argument(
110+
"--continents",
111+
type=int,
112+
default=7,
113+
help="Number of continents of newsfeeds (default: 7)."
114+
)
94115
parser.add_argument(
95116
"--debug",
96117
action="store_true",
@@ -101,19 +122,25 @@ def main():
101122
debug = args.debug
102123

103124
if debug:
104-
print(f"[DEBUG] Input arguments -> articles_per_continent: {args.articles_per_continent}, target_time: {args.target_time}, max_tkps: {args.max_tkps}, max_gpus: {args.max_gpus}")
125+
print(f"[DEBUG] Input arguments -> articles_per_continent: {args.articles_per_continent}, target_time: {args.target_time}, max_tkps: {args.max_tkps}, max_gpus: {args.max_gpus}, continents: {args.continents}")
105126

106127
# Execute optimization
107128
optimal_config, achieved_time = find_optimal_configuration(
108129
target_time=args.target_time,
109130
max_tkps=args.max_tkps,
110131
max_gpus=args.max_gpus,
111-
articles_per_continent=args.articles_per_continent
132+
articles_per_continent=args.articles_per_continent,
133+
continents=args.continents
112134
)
113135

114136
if optimal_config:
115137
optimal_tkps, optimal_gpus = optimal_config
116-
print(f"Optimal Configuration to achieve ~{args.target_time} minutes:")
138+
print("Optimal Configuration to achieve:")
139+
print(f"Target max processing time: <= {args.target_time} minutes")
140+
print(f"Number of continents of newsfeeds: {args.continents}")
141+
print(f"Number of articles per continent per 3 hours: {args.articles_per_continent}")
142+
print()
143+
print("Optimized compute resources estimated for processing time goal:")
117144
print(f"Tokens per second (tkps) per GPU: {optimal_tkps}")
118145
print(f"Number of GPUs: {optimal_gpus}")
119146
print(f"Achieved Processing Time: {achieved_time:.2f} minutes")

0 commit comments

Comments
 (0)