# Importing Eland
# Python Client and Toolkit for DataFrames, Big Data, Machine Learning and ETL in Elasticsearch
# https://eland.readthedocs.io/en/latest/
# https://github.com/elastic/eland
import eland as ed
# https://elasticsearch-dsl.readthedocs.io/en/latest/
# https://github.com/elastic/elasticsearch-dsl-py
# High level Python client for Elasticsearch
from elasticsearch_dsl import Search, Q
# https://elasticsearch-py.readthedocs.io/en/latest/
# https://github.com/elastic/elasticsearch-py
# Official Python low-level client for Elasticsearch
from elasticsearch import Elasticsearch
# Import pandas and numpy for data wrangling
import pandas as pd
import numpy as np
# For pretty-printing
# Function for pretty-printing JSON
def json(raw):
import json
print(json.dumps(raw, indent=2, sort_keys=True))# Connect to an Elasticsearch instance
# here we use the official Elastic Python client
# check it on https://github.com/elastic/elasticsearch-py
es = Elasticsearch(
['http://localhost:9200'],
http_auth=("es_kbn", "changeme")
)
# print the connection object info (same as visiting http://localhost:9200)
# make sure your elasticsearch node/cluster respond to requests
json(es.info()){
"cluster_name": "churn",
"cluster_uuid": "K3nB4fp_QcyjpY-e2XVUbA",
"name": "node-01",
"tagline": "You Know, for Search",
"version": {
"build_date": "2020-07-22T19:31:37.655268Z",
"build_flavor": "default",
"build_hash": "bbbd2282a6668869c41efc5713ad8214d44c0ad1",
"build_snapshot": true,
"build_type": "zip",
"lucene_version": "8.6.0",
"minimum_index_compatibility_version": "7.0.0",
"minimum_wire_compatibility_version": "7.10.0",
"number": "8.0.0-SNAPSHOT"
}
}
# name of the index we want to query
index_name = 'kibana_sample_data_ecommerce'
# defining the search statement to get all records in an index
search = Search(using=es, index=index_name).query("match_all")
# retrieving the documents from the search
documents = [hit.to_dict() for hit in search.scan()]
# converting the list of hit dictionaries into a pandas dataframe:
df_ecommerce = pd.DataFrame.from_records(documents)# visualizing the dataframe with the results:
df_ecommerce.head()['geoip']0 {'country_iso_code': 'EG', 'location': {'lon':...
1 {'country_iso_code': 'AE', 'location': {'lon':...
2 {'country_iso_code': 'US', 'location': {'lon':...
3 {'country_iso_code': 'GB', 'location': {'lon':...
4 {'country_iso_code': 'EG', 'location': {'lon':...
Name: geoip, dtype: object
# retrieving a summary of the columns in the dataset:
df_ecommerce.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4675 entries, 0 to 4674
Data columns (total 24 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 category 4675 non-null object
1 currency 4675 non-null object
2 customer_first_name 4675 non-null object
3 customer_full_name 4675 non-null object
4 customer_gender 4675 non-null object
5 customer_id 4675 non-null int64
6 customer_last_name 4675 non-null object
7 customer_phone 4675 non-null object
8 day_of_week 4675 non-null object
9 day_of_week_i 4675 non-null int64
10 email 4675 non-null object
11 manufacturer 4675 non-null object
12 order_date 4675 non-null object
13 order_id 4675 non-null int64
14 products 4675 non-null object
15 sku 4675 non-null object
16 taxful_total_price 4675 non-null float64
17 taxless_total_price 4675 non-null float64
18 total_quantity 4675 non-null int64
19 total_unique_products 4675 non-null int64
20 type 4675 non-null object
21 user 4675 non-null object
22 geoip 4675 non-null object
23 event 4675 non-null object
dtypes: float64(2), int64(5), object(17)
memory usage: 876.7+ KB
# getting descriptive statistics from the dataframe
df_ecommerce.describe()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
| customer_id | day_of_week_i | order_id | taxful_total_price | taxless_total_price | total_quantity | total_unique_products | |
|---|---|---|---|---|---|---|---|
| count | 4675.000000 | 4675.000000 | 4675.000000 | 4675.000000 | 4675.000000 | 4675.000000 | 4675.000000 |
| mean | 27.465668 | 3.114866 | 587280.693690 | 75.050062 | 75.050062 | 2.158503 | 2.157647 |
| std | 13.946711 | 1.930395 | 47891.081603 | 52.793929 | 52.793929 | 0.594071 | 0.588510 |
| min | 4.000000 | 0.000000 | 550375.000000 | 6.990000 | 6.990000 | 1.000000 | 1.000000 |
| 25% | 17.000000 | 1.000000 | 562044.000000 | 44.980000 | 44.980000 | 2.000000 | 2.000000 |
| 50% | 27.000000 | 3.000000 | 573671.000000 | 63.980000 | 63.980000 | 2.000000 | 2.000000 |
| 75% | 41.000000 | 5.000000 | 585415.000000 | 92.980000 | 92.980000 | 2.000000 | 2.000000 |
| max | 52.000000 | 6.000000 | 740002.000000 | 2249.920000 | 2249.920000 | 8.000000 | 4.000000 |
# loading the data from the Sample Ecommerce data from Kibana into Eland dataframe:
ed_ecommerce = ed.read_es(es, index_name)
# visualizing the results:
ed_ecommerce.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
| category | currency | customer_birth_date | customer_first_name | customer_full_name | customer_gender | customer_id | customer_last_name | customer_phone | day_of_week | ... | products.taxful_price | products.taxless_price | products.unit_discount_amount | sku | taxful_total_price | taxless_total_price | total_quantity | total_unique_products | type | user | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Fy_X0nMBxx5L21Ced97s | [Men's Clothing] | EUR | NaT | Eddie | Eddie Underwood | MALE | 38 | Underwood | Monday | ... | [11.99, 24.99] | [11.99, 24.99] | [0, 0] | [ZO0549605496, ZO0299602996] | 36.98 | 36.98 | 2 | 2 | order | eddie | |
| GC_X0nMBxx5L21Ced97t | [Women's Clothing] | EUR | NaT | Mary | Mary Bailey | FEMALE | 20 | Bailey | Sunday | ... | [24.99, 28.99] | [24.99, 28.99] | [0, 0] | [ZO0489604896, ZO0185501855] | 53.98 | 53.98 | 2 | 2 | order | mary | |
| GS_X0nMBxx5L21Ced97t | [Women's Shoes, Women's Clothing] | EUR | NaT | Gwen | Gwen Butler | FEMALE | 26 | Butler | Sunday | ... | [99.99, 99.99] | [99.99, 99.99] | [0, 0] | [ZO0374603746, ZO0272202722] | 199.98 | 199.98 | 2 | 2 | order | gwen | |
| Gi_X0nMBxx5L21Ced97t | [Women's Shoes, Women's Clothing] | EUR | NaT | Diane | Diane Chandler | FEMALE | 22 | Chandler | Sunday | ... | [74.99, 99.99] | [74.99, 99.99] | [0, 0] | [ZO0360303603, ZO0272002720] | 174.98 | 174.98 | 2 | 2 | order | diane | |
| Gy_X0nMBxx5L21Ced97t | [Men's Clothing, Men's Accessories] | EUR | NaT | Eddie | Eddie Weber | MALE | 38 | Weber | Monday | ... | [59.99, 20.99] | [59.99, 20.99] | [0, 0] | [ZO0542505425, ZO0601306013] | 80.98 | 80.98 | 2 | 2 | order | eddie |
5 rows × 46 columns
# retrieving a summary of the columns in the dataframe:
ed_ecommerce.info()<class 'eland.dataframe.DataFrame'>
Index: 4675 entries, Fy_X0nMBxx5L21Ced97s to XC_X0nMBxx5L21CejPB3
Data columns (total 46 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 category 4675 non-null object
1 currency 4675 non-null object
2 customer_birth_date 0 non-null datetime64[ns]
3 customer_first_name 4675 non-null object
4 customer_full_name 4675 non-null object
5 customer_gender 4675 non-null object
6 customer_id 4675 non-null object
7 customer_last_name 4675 non-null object
8 customer_phone 4675 non-null object
9 day_of_week 4675 non-null object
10 day_of_week_i 4675 non-null int64
11 email 4675 non-null object
12 event.dataset 4675 non-null object
13 geoip.city_name 4094 non-null object
14 geoip.continent_name 4675 non-null object
15 geoip.country_iso_code 4675 non-null object
16 geoip.location 4675 non-null object
17 geoip.region_name 3924 non-null object
18 manufacturer 4675 non-null object
19 order_date 4675 non-null datetime64[ns]
20 order_id 4675 non-null object
21 products._id 4675 non-null object
22 products.base_price 4675 non-null float64
23 products.base_unit_price 4675 non-null float64
24 products.category 4675 non-null object
25 products.created_on 4675 non-null datetime64[ns]
26 products.discount_amount 4675 non-null float64
27 products.discount_percentage 4675 non-null float64
28 products.manufacturer 4675 non-null object
29 products.min_price 4675 non-null float64
30 products.price 4675 non-null float64
31 products.product_id 4675 non-null int64
32 products.product_name 4675 non-null object
33 products.quantity 4675 non-null int64
34 products.sku 4675 non-null object
35 products.tax_amount 4675 non-null float64
36 products.taxful_price 4675 non-null float64
37 products.taxless_price 4675 non-null float64
38 products.unit_discount_amount 4675 non-null float64
39 sku 4675 non-null object
40 taxful_total_price 4675 non-null float64
41 taxless_total_price 4675 non-null float64
42 total_quantity 4675 non-null int64
43 total_unique_products 4675 non-null int64
44 type 4675 non-null object
45 user 4675 non-null object
dtypes: datetime64[ns](3), float64(12), int64(5), object(26)
memory usage: 64.0 bytes
# getting the dtypes from pandas dataframe:
df_ecommerce.dtypescategory object
currency object
customer_first_name object
customer_full_name object
customer_gender object
customer_id int64
customer_last_name object
customer_phone object
day_of_week object
day_of_week_i int64
email object
manufacturer object
order_date object
order_id int64
products object
sku object
taxful_total_price float64
taxless_total_price float64
total_quantity int64
total_unique_products int64
type object
user object
geoip object
event object
dtype: object
# retrieving the Data types for the index normally would require us to perform the following Elasticsearch query:
mapping = es.indices.get_mapping(index_name)
# which by itself is an abstraction of the GET request for mapping retrieval
json(mapping){
"kibana_sample_data_ecommerce": {
"mappings": {
"properties": {
"category": {
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"currency": {
"type": "keyword"
},
"customer_birth_date": {
"type": "date"
},
"customer_first_name": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
},
"customer_full_name": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
},
"customer_gender": {
"type": "keyword"
},
"customer_id": {
"type": "keyword"
},
"customer_last_name": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
},
"customer_phone": {
"type": "keyword"
},
"day_of_week": {
"type": "keyword"
},
"day_of_week_i": {
"type": "integer"
},
"email": {
"type": "keyword"
},
"event": {
"properties": {
"dataset": {
"type": "keyword"
}
}
},
"geoip": {
"properties": {
"city_name": {
"type": "keyword"
},
"continent_name": {
"type": "keyword"
},
"country_iso_code": {
"type": "keyword"
},
"location": {
"type": "geo_point"
},
"region_name": {
"type": "keyword"
}
}
},
"manufacturer": {
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"order_date": {
"type": "date"
},
"order_id": {
"type": "keyword"
},
"products": {
"properties": {
"_id": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
},
"base_price": {
"type": "half_float"
},
"base_unit_price": {
"type": "half_float"
},
"category": {
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"created_on": {
"type": "date"
},
"discount_amount": {
"type": "half_float"
},
"discount_percentage": {
"type": "half_float"
},
"manufacturer": {
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"min_price": {
"type": "half_float"
},
"price": {
"type": "half_float"
},
"product_id": {
"type": "long"
},
"product_name": {
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"quantity": {
"type": "integer"
},
"sku": {
"type": "keyword"
},
"tax_amount": {
"type": "half_float"
},
"taxful_price": {
"type": "half_float"
},
"taxless_price": {
"type": "half_float"
},
"unit_discount_amount": {
"type": "half_float"
}
}
},
"sku": {
"type": "keyword"
},
"taxful_total_price": {
"type": "half_float"
},
"taxless_total_price": {
"type": "half_float"
},
"total_quantity": {
"type": "integer"
},
"total_unique_products": {
"type": "integer"
},
"type": {
"type": "keyword"
},
"user": {
"type": "keyword"
}
}
}
}
}
# Eland abstracts this procedure into the same pandas api:
ed_ecommerce.dtypescategory object
currency object
customer_birth_date datetime64[ns]
customer_first_name object
customer_full_name object
customer_gender object
customer_id object
customer_last_name object
customer_phone object
day_of_week object
day_of_week_i int64
email object
event.dataset object
geoip.city_name object
geoip.continent_name object
geoip.country_iso_code object
geoip.location object
geoip.region_name object
manufacturer object
order_date datetime64[ns]
order_id object
products._id object
products.base_price float64
products.base_unit_price float64
products.category object
products.created_on datetime64[ns]
products.discount_amount float64
products.discount_percentage float64
products.manufacturer object
products.min_price float64
products.price float64
products.product_id int64
products.product_name object
products.quantity int64
products.sku object
products.tax_amount float64
products.taxful_price float64
products.taxless_price float64
products.unit_discount_amount float64
sku object
taxful_total_price float64
taxless_total_price float64
total_quantity int64
total_unique_products int64
type object
user object
dtype: object
# defining the full-text query we need: Retrieving records for either Elitelligence or Primemaster manufacturer
query = {
"query_string" : {
"fields" : ["manufacturer"],
"query" : "Elitelligence OR Primemaster"
}
}
# using full-text search capabilities with Eland:
text_search_df = ed_ecommerce.es_query(query)
# visualizing price of products for each manufacturer using pandas column syntax:
text_search_df[['manufacturer','products.price']]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
| manufacturer | products.price | |
|---|---|---|
| Fy_X0nMBxx5L21Ced97s | [Elitelligence, Oceanavigations] | [11.99, 24.99] |
| Gi_X0nMBxx5L21Ced97t | [Primemaster, Oceanavigations] | [74.99, 99.99] |
| Gy_X0nMBxx5L21Ced97t | [Elitelligence] | [59.99, 20.99] |
| HS_X0nMBxx5L21Ced97t | [Low Tide Media, Elitelligence] | [20.99, 24.99] |
| Hi_X0nMBxx5L21Ced97t | [Low Tide Media, Oceanavigations, Elitelligence] | [28.99, 41.99, 59.99, 7.99] |
| ... | ... | ... |
| SC_X0nMBxx5L21CejPB3 | [Elitelligence] | [59.99, 18.99] |
| Sy_X0nMBxx5L21CejPB3 | [Low Tide Media, Elitelligence] | [24.99, 41.99] |
| Ty_X0nMBxx5L21CejPB3 | [Elitelligence] | [24.99, 25.99] |
| US_X0nMBxx5L21CejPB3 | [Angeldale, Elitelligence] | [79.99, 10.99] |
| VS_X0nMBxx5L21CejPB3 | [Elitelligence, Low Tide Media] | [7.99, 59.99] |
1435 rows × 2 columns