Commit 01afb91

Merge pull request #289 from chdb-io/patchset-2.2.0b1
Fix starving caused by GIL
2 parents 59075e8 + 09da2d5 commit 01afb91
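
What changed: PythonSource::generate() used to call isInheritsFromPyReader(data_source) on every chunk, and that check has to inspect the Python object, which presumably means taking the GIL; with several source streams all re-acquiring the GIL just to make the same branch decision, they could starve one another. The diff below performs the check once in StoragePython::read() and passes the answer into PythonSource as a plain bool, so the per-chunk path only touches the GIL where it actually calls into Python. A rough sketch of the pattern (illustrative names, not the exact chdb code; the read() call on the PyReader object is hypothetical):

    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    struct SketchSource
    {
        py::object & data_source;   // borrowed reference, never owned
        bool is_py_reader;          // decided once, under the GIL, at construction time

        SketchSource(py::object & src, bool is_py_reader_)
            : data_source(src), is_py_reader(is_py_reader_)
        {
        }

        void generateChunk()
        {
            if (is_py_reader)       // plain bool: no GIL needed just to branch
            {
                py::gil_scoped_acquire acquire;   // GIL held only while Python is actually called
                data_source.attr("read")();       // hypothetical PyReader call, for illustration
            }
            // else: scan columns cached earlier, without touching Python at all
        }
    };

    // The caller does the GIL-dependent type check exactly once, e.g.:
    //     bool is_reader = isInheritsFromPyReader(data_source);
    //     auto source = std::make_shared<SketchSource>(data_source, is_reader);

Keeping data_source as a borrowed reference (as the header comment below notes) and confining GIL acquisition to the code that genuinely needs Python is what lets multiple streams keep making progress.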

5 files changed: +147 additions, -3 deletions


src/Processors/Sources/PythonSource.cpp
Lines changed: 3 additions & 1 deletion

@@ -48,6 +48,7 @@ extern const int PY_EXCEPTION_OCCURED;
 
 PythonSource::PythonSource(
     py::object & data_source_,
+    bool isInheritsFromPyReader_,
     const Block & sample_block_,
     PyColumnVecPtr column_cache,
     size_t data_source_row_count,
@@ -56,6 +57,7 @@ PythonSource::PythonSource(
     size_t num_streams)
     : ISource(sample_block_.cloneEmpty())
     , data_source(data_source_)
+    , isInheritsFromPyReader(isInheritsFromPyReader_)
     , sample_block(sample_block_)
     , column_cache(column_cache)
     , data_source_row_count(data_source_row_count)
@@ -544,7 +546,7 @@ Chunk PythonSource::generate()
 
     try
     {
-        if (isInheritsFromPyReader(data_source))
+        if (isInheritsFromPyReader)
         {
             PyObjectVecPtr data;
             py::gil_scoped_acquire acquire;

src/Processors/Sources/PythonSource.h
Lines changed: 2 additions & 0 deletions

@@ -26,6 +26,7 @@ class PythonSource : public ISource
 public:
     PythonSource(
         py::object & data_source_,
+        bool isInheritsFromPyReader_,
         const Block & sample_block_,
         PyColumnVecPtr column_cache,
         size_t data_source_row_count,
@@ -42,6 +43,7 @@ class PythonSource : public ISource
 
 private:
     py::object & data_source; // Do not own the reference
+    bool isInheritsFromPyReader; // If the data_source is a PyReader object
 
     Block sample_block;
     PyColumnVecPtr column_cache;

src/Storages/StoragePython.cpp
Lines changed: 3 additions & 2 deletions

@@ -70,7 +70,8 @@ Pipe StoragePython::read(
 
     if (isInheritsFromPyReader(data_source))
     {
-        return Pipe(std::make_shared<PythonSource>(data_source, sample_block, column_cache, data_source_row_count, max_block_size, 0, 1));
+        return Pipe(
+            std::make_shared<PythonSource>(data_source, true, sample_block, column_cache, data_source_row_count, max_block_size, 0, 1));
     }
 
     prepareColumnCache(column_names, sample_block.getColumns(), sample_block);
@@ -79,7 +80,7 @@ Pipe StoragePython::read(
     // num_streams = 32; // for chdb testing
     for (size_t stream = 0; stream < num_streams; ++stream)
         pipes.emplace_back(std::make_shared<PythonSource>(
-            data_source, sample_block, column_cache, data_source_row_count, max_block_size, stream, num_streams));
+            data_source, false, sample_block, column_cache, data_source_row_count, max_block_size, stream, num_streams));
     return Pipe::unitePipes(std::move(pipes));
 }
 

tests/queries.sql
Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+SELECT COUNT(*) FROM Python(hits);
+SELECT COUNT(*) FROM Python(hits) WHERE AdvEngineID <> 0;
+SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM Python(hits);
+SELECT AVG(UserID) FROM Python(hits);
+SELECT COUNT(DISTINCT UserID) FROM Python(hits);
+SELECT COUNT(DISTINCT SearchPhrase) FROM Python(hits);
+SELECT MIN(EventDate), MAX(EventDate) FROM Python(hits);
+SELECT AdvEngineID, COUNT(*) FROM Python(hits) WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;
+SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM Python(hits) GROUP BY RegionID ORDER BY u DESC LIMIT 10;
+SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM Python(hits) GROUP BY RegionID ORDER BY c DESC LIMIT 10;
+SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
+SELECT SearchPhrase, COUNT(*) AS c FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
+SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT UserID, COUNT(*) FROM Python(hits) GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;
+SELECT UserID, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
+SELECT UserID, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, SearchPhrase LIMIT 10;
+SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM Python(hits) GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
+SELECT UserID FROM Python(hits) WHERE UserID = 435090932899640449;
+SELECT COUNT(*) FROM Python(hits) WHERE URL LIKE '%google%';
+SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM Python(hits) WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM Python(hits) WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT * FROM Python(hits) WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;
+SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;
+SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;
+SELECT SearchPhrase FROM Python(hits) WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;
+SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM Python(hits) WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
+SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM Python(hits) WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
+SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM Python(hits);
+SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM Python(hits) GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
+SELECT URL, COUNT(*) AS c FROM Python(hits) GROUP BY URL ORDER BY c DESC LIMIT 10;
+SELECT 1, URL, COUNT(*) AS c FROM Python(hits) GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
+SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM Python(hits) GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
+SELECT URL, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
+SELECT Title, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
+SELECT URL, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
+SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
+SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
+SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
+SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM Python(hits) WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000

tests/test_state2_dataframe.py
Lines changed: 96 additions & 0 deletions

@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+import unittest
+import timeit
+import datetime
+import json
+import tempfile
+import pandas as pd
+import chdb
+import os
+from urllib.request import urlretrieve
+
+
+class TestChDBDataFrame(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        # Download parquet file if it doesn't exist
+        parquet_file = "hits_0.parquet"
+        if not os.path.exists(parquet_file):
+            print(f"Downloading {parquet_file}...")
+            url = "https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_0.parquet"
+            urlretrieve(url, parquet_file)
+            print("Download complete!")
+
+        # Load data and prepare DataFrame
+        cls.hits = pd.read_parquet(parquet_file)
+        cls.dataframe_size = cls.hits.memory_usage().sum()
+
+        # Fix types
+        cls.hits["EventTime"] = pd.to_datetime(cls.hits["EventTime"], unit="s")
+        cls.hits["EventDate"] = pd.to_datetime(cls.hits["EventDate"], unit="D")
+
+        # Convert object columns to string
+        for col in cls.hits.columns:
+            if cls.hits[col].dtype == "O":
+                cls.hits[col] = cls.hits[col].astype(str)
+
+        # Load queries
+        with open("queries.sql") as f:
+            cls.queries = f.readlines()
+
+    def setUp(self):
+        self.tmp_dir = tempfile.TemporaryDirectory()
+        self.conn = chdb.connect(f"{self.tmp_dir.name}")
+
+    def tearDown(self):
+        self.conn.close()
+        self.tmp_dir.cleanup()
+
+    def test_dataframe_size(self):
+        self.assertGreater(self.dataframe_size, 0, "DataFrame size should be positive")
+
+    def test_query_execution(self):
+        queries_times = []
+        for i, query in enumerate(self.queries, 1):
+            times = []
+            for _ in range(3):
+                start = timeit.default_timer()
+                result = self.conn.query(query, "CSV")
+                end = timeit.default_timer()
+                times.append(end - start)
+
+            # Verify query results are not empty
+            self.assertIsNotNone(result, f"Query {i} returned None")
+
+            queries_times.append(times)
+            # Verify execution times are reasonable
+            self.assertTrue(
+                all(t > 0 for t in times), f"Query {i} has invalid execution times"
+            )
+
+        result_json = {
+            "system": "chDB 2.2(DataFrame)",
+            "date": datetime.date.today().strftime("%Y-%m-%d"),
+            "machine": "",
+            "cluster_size": 1,
+            "comment": "",
+            "tags": [
+                "C++",
+                "column-oriented",
+                "embedded",
+                "stateless",
+                "serverless",
+                "dataframe",
+                "ClickHouse derivative",
+            ],
+            "load_time": 0,
+            "data_size": int(self.dataframe_size),
+            "result": queries_times,  # Populated by the loop above
+        }
+
+        print(json.dumps(result_json, indent=2))
+
+
+if __name__ == "__main__":
+    unittest.main()
