Skip to content

Commit a3c23ab

Browse files
bjornalm and Harshg999 authored
[metastore] add config flag to fetch sample data for views (#4172)
- This commit introduces a new configuration flag, allow_sample_data_from_views, to control whether Hue attempts to fetch sample data for database views.
- Previously, fetching sample data from views was disabled by default due to potential performance concerns with complex or long-running views. This change makes this behavior configurable.
- Default value: false (maintains previous behavior)
- If set to true, Hue will attempt to fetch sample data for views.

---------

Co-authored-by: Harsh Gupta <[email protected]>
1 parent 727430a commit a3c23ab

File tree

10 files changed

+128
-92
lines changed

10 files changed

+128
-92
lines changed

apps/beeswax/src/beeswax/api.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -15,9 +15,9 @@
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
import re
1918
import json
2019
import logging
20+
import re
2121
from builtins import zip
2222

2323
from django.http import Http404
@@ -33,7 +33,7 @@
3333
from beeswax.forms import QueryForm
3434
from beeswax.models import QueryHistory, Session
3535
from beeswax.server import dbms
36-
from beeswax.server.dbms import QueryServerException, QueryServerTimeoutException, SubQueryTable, expand_exception, get_query_server_config
36+
from beeswax.server.dbms import expand_exception, get_query_server_config, QueryServerException, QueryServerTimeoutException, SubQueryTable
3737
from beeswax.views import (
3838
_get_query_handle_and_state,
3939
authorized_get_design,
@@ -52,9 +52,9 @@
5252
from desktop.lib.i18n import force_unicode
5353
from desktop.lib.parameterization import substitute_variables
5454
from metastore import parser
55-
from metastore.conf import FORCE_HS2_METADATA
55+
from metastore.conf import ALLOW_SAMPLE_DATA_FROM_VIEWS, FORCE_HS2_METADATA
5656
from metastore.views import _get_db, _get_servername
57-
from notebook.models import MockedDjangoRequest, escape_rows, make_notebook
57+
from notebook.models import escape_rows, make_notebook, MockedDjangoRequest
5858
from useradmin.models import User
5959

6060
LOG = logging.getLogger()
@@ -606,7 +606,7 @@ def save_results_hdfs_file(request, query_history_id):
606606

607607
try:
608608
handle, state = _get_query_handle_and_state(query_history)
609-
except Exception as ex:
609+
except Exception:
610610
response['message'] = _('Cannot find query handle and state: %s') % str(query_history)
611611
response['status'] = -2
612612
return JsonResponse(response)
@@ -669,7 +669,7 @@ def save_results_hive_table(request, query_history_id):
669669
try:
670670
handle, state = _get_query_handle_and_state(query_history)
671671
result_meta = db.get_results_metadata(handle)
672-
except Exception as ex:
672+
except Exception:
673673
response['message'] = _('Cannot find query handle and state: %s') % str(query_history)
674674
response['status'] = -2
675675
return JsonResponse(response)
@@ -733,7 +733,7 @@ def _get_sample_data(db, database, table, column, nested, is_async=False, cluste
733733
query_server = get_query_server_config('impala', connector=cluster)
734734
db = dbms.get(db.client.user, query_server, cluster=cluster)
735735

736-
if table_obj and table_obj.is_view:
736+
if table_obj and table_obj.is_view and not ALLOW_SAMPLE_DATA_FROM_VIEWS.get():
737737
response = {'status': -1}
738738
response['message'] = _('Not getting sample data as this is a view which can be expensive when run.')
739739
return response

apps/beeswax/src/beeswax/api_tests.py

Lines changed: 80 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -16,25 +16,22 @@
1616
# See the License for the specific language governing permissions and
1717
# limitations under the License.
1818

19-
import sys
2019
import logging
2120
from unittest.mock import Mock, patch
2221

2322
import pytest
24-
from django.test import TestCase
2523
from requests.exceptions import ReadTimeout
2624

27-
from beeswax.api import _autocomplete, get_functions
25+
from beeswax.api import _autocomplete, _get_functions, _get_sample_data
2826
from desktop.lib.django_test_util import make_logged_in_client
29-
from desktop.lib.test_utils import add_to_group, grant_access
27+
from metastore.conf import ALLOW_SAMPLE_DATA_FROM_VIEWS
3028
from useradmin.models import User
3129

3230
LOG = logging.getLogger()
3331

3432

3533
@pytest.mark.django_db
36-
class TestApi():
37-
34+
class TestApi:
3835
def setup_method(self):
3936
self.client = make_logged_in_client(username="test", groupname="default", recreate=True, is_superuser=False)
4037
self.user = User.objects.get(username="test")
@@ -43,75 +40,99 @@ def test_autocomplete_time_out(self):
4340
get_tables_meta = Mock(
4441
side_effect=ReadTimeout("HTTPSConnectionPool(host='gethue.com', port=10001): Read timed out. (read timeout=120)")
4542
)
46-
db = Mock(
47-
get_tables_meta=get_tables_meta
48-
)
43+
db = Mock(get_tables_meta=get_tables_meta)
4944

50-
resp = _autocomplete(db, database='database')
45+
resp = _autocomplete(db, database="database")
5146

52-
assert (
53-
resp ==
54-
{
55-
'code': 500,
56-
'error': "HTTPSConnectionPool(host='gethue.com', port=10001): Read timed out. (read timeout=120)"
57-
})
47+
assert resp == {"code": 500, "error": "HTTPSConnectionPool(host='gethue.com', port=10001): Read timed out. (read timeout=120)"}
5848

5949
def test_get_functions(self):
60-
db = Mock(
61-
get_functions=Mock(
62-
return_value=Mock(
63-
rows=Mock(
64-
return_value=[{'name': 'f1'}, {'name': 'f2'}]
65-
)
66-
)
67-
)
68-
)
50+
# Mock db.get_functions() to return rows that escape_rows can process
51+
# Each row should be a list where row[0] is the function name
52+
db = Mock()
53+
db.get_functions = Mock(return_value=[["f1"], ["f2"]]) # Return list of rows
54+
db.client = Mock(query_server={"dialect": "hive"}) # Non-Impala dialect
6955

70-
resp = get_functions(db)
56+
resp = _get_functions(db) # Call the internal function
7157

72-
assert (
73-
resp ==
74-
[{'name': 'f1'}, {'name': 'f2'}])
58+
assert resp == [{"name": "f1"}, {"name": "f2"}]
7559

76-
def test_get_functions(self):
77-
with patch('beeswax.api._get_functions') as _get_functions:
60+
def test_autocomplete_functions(self):
61+
with patch("beeswax.api._get_functions") as _get_functions:
7862
db = Mock()
79-
_get_functions.return_value = [
80-
{'name': 'f1'}, {'name': 'f2'}, {'name': 'f3'}
81-
]
63+
_get_functions.return_value = [{"name": "f1"}, {"name": "f2"}, {"name": "f3"}]
8264

83-
resp = _autocomplete(db, database='default', operation='functions')
65+
resp = _autocomplete(db, database="default", operation="functions")
8466

85-
assert (
86-
resp['functions'] ==
87-
[{'name': 'f1'}, {'name': 'f2'}, {'name': 'f3'}])
67+
assert resp["functions"] == [{"name": "f1"}, {"name": "f2"}, {"name": "f3"}]
8868

8969
def test_get_function(self):
9070
db = Mock()
91-
db.client = Mock(query_server={'dialect': 'hive'})
71+
db.client = Mock(query_server={"dialect": "hive"})
9272
db.get_function = Mock(
9373
return_value=[
94-
['floor_month(param) - Returns the timestamp at a month granularity'],
95-
['param needs to be a timestamp value'],
96-
['Example:'],
74+
["floor_month(param) - Returns the timestamp at a month granularity"],
75+
["param needs to be a timestamp value"],
76+
["Example:"],
9777
["> SELECT floor_month(CAST('yyyy-MM-dd HH:mm:ss' AS TIMESTAMP)) FROM src;"],
98-
['yyyy-MM-01 00:00:00']
78+
["yyyy-MM-01 00:00:00"],
9979
]
10080
)
10181

102-
data = _autocomplete(db, database='floor_month', operation='function')
103-
104-
assert (
105-
data['function'] ==
106-
{
107-
'name': 'floor_month',
108-
'signature': 'floor_month(param)',
109-
'description':
110-
'Returns the timestamp at a month granularity\nparam needs to be a timestamp value\nExample:\n'
111-
'> SELECT floor_month(CAST(\'yyyy-MM-dd HH:mm:ss\' AS TIMESTAMP)) FROM src;\nyyyy-MM-01 00:00:00'
112-
})
113-
114-
db.client = Mock(query_server={'dialect': 'impala'})
115-
data = _autocomplete(db, operation='function')
116-
117-
assert data['function'] == {}
82+
data = _autocomplete(db, database="floor_month", operation="function")
83+
84+
assert data["function"] == {
85+
"name": "floor_month",
86+
"signature": "floor_month(param)",
87+
"description": "Returns the timestamp at a month granularity\nparam needs to be a timestamp value\nExample:\n"
88+
"> SELECT floor_month(CAST('yyyy-MM-dd HH:mm:ss' AS TIMESTAMP)) FROM src;\nyyyy-MM-01 00:00:00",
89+
}
90+
91+
db.client = Mock(query_server={"dialect": "impala"})
92+
data = _autocomplete(db, operation="function")
93+
94+
assert data["function"] == {}
95+
96+
@patch("beeswax.api.dbms.get")
97+
def test_get_sample_data_for_views(self, mock_dbms_get):
98+
# Mock table_obj
99+
table_obj_mock = Mock(is_view=True, is_impala_only=False)
100+
101+
# Mock the db object that dbms.get() would return
102+
db_mock = Mock(get_table=Mock(return_value=table_obj_mock))
103+
mock_dbms_get.return_value = db_mock
104+
105+
# Scenario 1: allow_sample_data_from_views is False
106+
reset = ALLOW_SAMPLE_DATA_FROM_VIEWS.set_for_testing(False)
107+
try:
108+
response = _get_sample_data(db_mock, "default_db", "test_view_table", None, None)
109+
110+
assert response == {
111+
"status": -1,
112+
"message": "Not getting sample data as this is a view which can be expensive when run.",
113+
}
114+
finally:
115+
reset()
116+
117+
# Scenario 2: allow_sample_data_from_views is True
118+
reset = ALLOW_SAMPLE_DATA_FROM_VIEWS.set_for_testing(True)
119+
try:
120+
# Mock db.get_sample to simulate successful data fetching past the view check
121+
# We expect it to be called if the view check is passed.
122+
db_mock.get_sample.return_value = Mock(
123+
rows=Mock(return_value=[["col1_val", "col2_val"]]),
124+
cols=Mock(return_value=["col1", "col2"]),
125+
full_cols=Mock(return_value=[{"name": "col1"}, {"name": "col2"}]),
126+
)
127+
mock_dbms_get.return_value = db_mock
128+
129+
response = _get_sample_data(db_mock, "default_db", "test_view_table", None, None)
130+
assert response == {
131+
"status": 0,
132+
"headers": ["col1", "col2"],
133+
"full_headers": [{"name": "col1"}, {"name": "col2"}],
134+
"rows": [["col1_val", "col2_val"]],
135+
}
136+
db_mock.get_sample.assert_called_once_with("default_db", table_obj_mock, None, None, generate_sql_only=False, operation=None)
137+
finally:
138+
reset()

apps/beeswax/src/beeswax/conf.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,19 +15,19 @@
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
import math
1918
import logging
19+
import math
2020
import os.path
2121

22-
from django.utils.translation import gettext as _, gettext_lazy as _t
22+
from django.utils.translation import gettext_lazy as _t
2323

2424
from desktop.conf import (
2525
AUTH_PASSWORD as DEFAULT_AUTH_PASSWORD,
2626
AUTH_USERNAME as DEFAULT_AUTH_USERNAME,
2727
default_ssl_cacerts,
2828
default_ssl_validate,
2929
)
30-
from desktop.lib.conf import Config, ConfigSection, coerce_bool, coerce_csv, coerce_password_from_script
30+
from desktop.lib.conf import coerce_bool, coerce_csv, coerce_password_from_script, Config, ConfigSection
3131

3232
LOG = logging.getLogger()
3333

apps/metastore/src/metastore/conf.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717

1818
from django.utils.translation import gettext_lazy as _
1919

20-
from desktop.lib.conf import Config, coerce_bool
20+
from desktop.lib.conf import coerce_bool, Config
2121

2222
ENABLE_NEW_CREATE_TABLE = Config(
2323
key="enable_new_create_table",
@@ -41,3 +41,10 @@
4141
type=coerce_bool,
4242
help=_('Choose whether to show the table ERD component.')
4343
)
44+
45+
ALLOW_SAMPLE_DATA_FROM_VIEWS = Config(
46+
key='allow_sample_data_from_views',
47+
default=False,
48+
type=coerce_bool,
49+
help=_('Choose whether to allow fetching sample data from views. By default, this is false to prevent potentially expensive queries.')
50+
)

desktop/conf.dist/hue.ini

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1578,6 +1578,10 @@ submit_to=True
15781578
# Choose whether to show the table ERD component. Default false
15791579
## show_table_erd=false
15801580

1581+
# Choose whether to allow fetching sample data from views.
1582+
# By default, this is false to prevent potentially expensive queries.
1583+
## allow_sample_data_from_views=false
1584+
15811585
###########################################################################
15821586
# Settings to configure Impala
15831587
###########################################################################

desktop/conf/pseudo-distributed.ini.tmpl

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1547,7 +1547,6 @@
15471547
# Choose whether Hue should validate certificates received from the server.
15481548
## validate=true
15491549

1550-
15511550
###########################################################################
15521551
# Settings to configure Metastore
15531552
###########################################################################
@@ -1556,6 +1555,9 @@
15561555
# Flag to turn on the new version of the create table wizard.
15571556
## enable_new_create_table=true
15581557

1558+
# Allow fetching sample data from views. By default, this is false to prevent potentially expensive queries.
1559+
## allow_sample_data_from_views=false
1560+
15591561
# Flag to force all metadata calls (e.g. list tables, table or column details...) to happen via HiveServer2 if available instead of Impala.
15601562
## force_hs2_metadata=false
15611563

desktop/core/src/desktop/api2.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,7 @@
9292
search_entities_interactive as metadata_search_entities_interactive,
9393
)
9494
from metadata.conf import has_catalog
95+
from metastore.conf import ALLOW_SAMPLE_DATA_FROM_VIEWS
9596
from notebook.connectors.base import get_interpreter, Notebook
9697
from notebook.management.commands import notebook_setup
9798
from pig.management.commands import pig_setup
@@ -146,6 +147,7 @@ def get_config(request):
146147
'is_yarn_enabled': is_yarn(),
147148
'enable_task_server': TASK_SERVER_V2.ENABLED.get(),
148149
'enable_workflow_creation_action': ENABLE_WORKFLOW_CREATION_ACTION.get(),
150+
'allow_sample_data_from_views': ALLOW_SAMPLE_DATA_FROM_VIEWS.get(),
149151
}
150152

151153
# Storage browser configuration

desktop/core/src/desktop/js/catalog/DataCatalogEntry.ts

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ import * as ko from 'knockout';
1818
import KnockoutObservable from '@types/knockout';
1919

2020
import { Cancellable, CancellablePromise } from 'api/cancellablePromise';
21+
import { getLastKnownConfig } from 'config/hueConfig';
2122
import {
2223
addNavTags,
2324
deleteNavTags,
@@ -1662,7 +1663,8 @@ export default class DataCatalogEntry {
16621663
operation?: string;
16631664
}
16641665
): CancellablePromise<Sample> {
1665-
if (this.isView()) {
1666+
const config = getLastKnownConfig();
1667+
if (this.isView() && (!config || !config.hue_config?.allow_sample_data_from_views)) {
16661668
return CancellablePromise.reject();
16671669
}
16681670

desktop/core/src/desktop/js/config/types.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,7 @@ export interface HueConfig extends GenericApiResponse {
9696
enable_task_server: boolean;
9797
is_admin: boolean;
9898
is_yarn_enabled: boolean;
99+
allow_sample_data_from_views: boolean;
99100
};
100101
storage_browser: StorageBrowserConfig;
101102
hue_version?: string;

0 commit comments

Comments
 (0)