Skip to content

synapticielfactory/eland_es

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

2 Commits
 
 
 
 

Repository files navigation

# Importing Eland
# Python Client and Toolkit for DataFrames, Big Data, Machine Learning and ETL in Elasticsearch
# https://eland.readthedocs.io/en/latest/
# https://github.com/elastic/eland
import eland as ed

# https://elasticsearch-dsl.readthedocs.io/en/latest/
# https://github.com/elastic/elasticsearch-dsl-py
# High level Python client for Elasticsearch
from elasticsearch_dsl import Search, Q

# https://elasticsearch-py.readthedocs.io/en/latest/
# https://github.com/elastic/elasticsearch-py
# Official Python low-level client for Elasticsearch
from elasticsearch import Elasticsearch

# Import pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# For pretty-printing
# Function for pretty-printing JSON
def json(raw):
    import json
    print(json.dumps(raw, indent=2, sort_keys=True))
# Connect to an Elasticsearch instance
# here we use the official Elastic Python client
# check it on https://github.com/elastic/elasticsearch-py
es = Elasticsearch(
  ['http://localhost:9200'],
  http_auth=("es_kbn", "changeme")
)
# print the connection object info (same as visiting http://localhost:9200)
# make sure your elasticsearch node/cluster respond to requests
json(es.info())
{
  "cluster_name": "churn",
  "cluster_uuid": "K3nB4fp_QcyjpY-e2XVUbA",
  "name": "node-01",
  "tagline": "You Know, for Search",
  "version": {
    "build_date": "2020-07-22T19:31:37.655268Z",
    "build_flavor": "default",
    "build_hash": "bbbd2282a6668869c41efc5713ad8214d44c0ad1",
    "build_snapshot": true,
    "build_type": "zip",
    "lucene_version": "8.6.0",
    "minimum_index_compatibility_version": "7.0.0",
    "minimum_wire_compatibility_version": "7.10.0",
    "number": "8.0.0-SNAPSHOT"
  }
}
# name of the index we want to query
index_name = 'kibana_sample_data_ecommerce' 

# defining the search statement to get all records in an index
search = Search(using=es, index=index_name).query("match_all") 

# retrieving the documents from the search
documents = [hit.to_dict() for hit in search.scan()] 

# converting the list of hit dictionaries into a pandas dataframe:
df_ecommerce = pd.DataFrame.from_records(documents)
# visualizing the dataframe with the results:
df_ecommerce.head()['geoip']
0    {'country_iso_code': 'EG', 'location': {'lon':...
1    {'country_iso_code': 'AE', 'location': {'lon':...
2    {'country_iso_code': 'US', 'location': {'lon':...
3    {'country_iso_code': 'GB', 'location': {'lon':...
4    {'country_iso_code': 'EG', 'location': {'lon':...
Name: geoip, dtype: object
# retrieving a summary of the columns in the dataset:
df_ecommerce.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4675 entries, 0 to 4674
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   category               4675 non-null   object 
 1   currency               4675 non-null   object 
 2   customer_first_name    4675 non-null   object 
 3   customer_full_name     4675 non-null   object 
 4   customer_gender        4675 non-null   object 
 5   customer_id            4675 non-null   int64  
 6   customer_last_name     4675 non-null   object 
 7   customer_phone         4675 non-null   object 
 8   day_of_week            4675 non-null   object 
 9   day_of_week_i          4675 non-null   int64  
 10  email                  4675 non-null   object 
 11  manufacturer           4675 non-null   object 
 12  order_date             4675 non-null   object 
 13  order_id               4675 non-null   int64  
 14  products               4675 non-null   object 
 15  sku                    4675 non-null   object 
 16  taxful_total_price     4675 non-null   float64
 17  taxless_total_price    4675 non-null   float64
 18  total_quantity         4675 non-null   int64  
 19  total_unique_products  4675 non-null   int64  
 20  type                   4675 non-null   object 
 21  user                   4675 non-null   object 
 22  geoip                  4675 non-null   object 
 23  event                  4675 non-null   object 
dtypes: float64(2), int64(5), object(17)
memory usage: 876.7+ KB
# getting descriptive statistics from the dataframe
df_ecommerce.describe()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
customer_id day_of_week_i order_id taxful_total_price taxless_total_price total_quantity total_unique_products
count 4675.000000 4675.000000 4675.000000 4675.000000 4675.000000 4675.000000 4675.000000
mean 27.465668 3.114866 587280.693690 75.050062 75.050062 2.158503 2.157647
std 13.946711 1.930395 47891.081603 52.793929 52.793929 0.594071 0.588510
min 4.000000 0.000000 550375.000000 6.990000 6.990000 1.000000 1.000000
25% 17.000000 1.000000 562044.000000 44.980000 44.980000 2.000000 2.000000
50% 27.000000 3.000000 573671.000000 63.980000 63.980000 2.000000 2.000000
75% 41.000000 5.000000 585415.000000 92.980000 92.980000 2.000000 2.000000
max 52.000000 6.000000 740002.000000 2249.920000 2249.920000 8.000000 4.000000
# loading the data from the Sample Ecommerce data from Kibana into Eland dataframe:
ed_ecommerce = ed.read_es(es, index_name)
# visualizing the results:
ed_ecommerce.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
category currency customer_birth_date customer_first_name customer_full_name customer_gender customer_id customer_last_name customer_phone day_of_week ... products.taxful_price products.taxless_price products.unit_discount_amount sku taxful_total_price taxless_total_price total_quantity total_unique_products type user
Fy_X0nMBxx5L21Ced97s [Men's Clothing] EUR NaT Eddie Eddie Underwood MALE 38 Underwood Monday ... [11.99, 24.99] [11.99, 24.99] [0, 0] [ZO0549605496, ZO0299602996] 36.98 36.98 2 2 order eddie
GC_X0nMBxx5L21Ced97t [Women's Clothing] EUR NaT Mary Mary Bailey FEMALE 20 Bailey Sunday ... [24.99, 28.99] [24.99, 28.99] [0, 0] [ZO0489604896, ZO0185501855] 53.98 53.98 2 2 order mary
GS_X0nMBxx5L21Ced97t [Women's Shoes, Women's Clothing] EUR NaT Gwen Gwen Butler FEMALE 26 Butler Sunday ... [99.99, 99.99] [99.99, 99.99] [0, 0] [ZO0374603746, ZO0272202722] 199.98 199.98 2 2 order gwen
Gi_X0nMBxx5L21Ced97t [Women's Shoes, Women's Clothing] EUR NaT Diane Diane Chandler FEMALE 22 Chandler Sunday ... [74.99, 99.99] [74.99, 99.99] [0, 0] [ZO0360303603, ZO0272002720] 174.98 174.98 2 2 order diane
Gy_X0nMBxx5L21Ced97t [Men's Clothing, Men's Accessories] EUR NaT Eddie Eddie Weber MALE 38 Weber Monday ... [59.99, 20.99] [59.99, 20.99] [0, 0] [ZO0542505425, ZO0601306013] 80.98 80.98 2 2 order eddie

5 rows × 46 columns

# retrieving a summary of the columns in the dataframe:
ed_ecommerce.info()
<class 'eland.dataframe.DataFrame'>
Index: 4675 entries, Fy_X0nMBxx5L21Ced97s to XC_X0nMBxx5L21CejPB3
Data columns (total 46 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   category                       4675 non-null   object        
 1   currency                       4675 non-null   object        
 2   customer_birth_date            0 non-null      datetime64[ns]
 3   customer_first_name            4675 non-null   object        
 4   customer_full_name             4675 non-null   object        
 5   customer_gender                4675 non-null   object        
 6   customer_id                    4675 non-null   object        
 7   customer_last_name             4675 non-null   object        
 8   customer_phone                 4675 non-null   object        
 9   day_of_week                    4675 non-null   object        
 10  day_of_week_i                  4675 non-null   int64         
 11  email                          4675 non-null   object        
 12  event.dataset                  4675 non-null   object        
 13  geoip.city_name                4094 non-null   object        
 14  geoip.continent_name           4675 non-null   object        
 15  geoip.country_iso_code         4675 non-null   object        
 16  geoip.location                 4675 non-null   object        
 17  geoip.region_name              3924 non-null   object        
 18  manufacturer                   4675 non-null   object        
 19  order_date                     4675 non-null   datetime64[ns]
 20  order_id                       4675 non-null   object        
 21  products._id                   4675 non-null   object        
 22  products.base_price            4675 non-null   float64       
 23  products.base_unit_price       4675 non-null   float64       
 24  products.category              4675 non-null   object        
 25  products.created_on            4675 non-null   datetime64[ns]
 26  products.discount_amount       4675 non-null   float64       
 27  products.discount_percentage   4675 non-null   float64       
 28  products.manufacturer          4675 non-null   object        
 29  products.min_price             4675 non-null   float64       
 30  products.price                 4675 non-null   float64       
 31  products.product_id            4675 non-null   int64         
 32  products.product_name          4675 non-null   object        
 33  products.quantity              4675 non-null   int64         
 34  products.sku                   4675 non-null   object        
 35  products.tax_amount            4675 non-null   float64       
 36  products.taxful_price          4675 non-null   float64       
 37  products.taxless_price         4675 non-null   float64       
 38  products.unit_discount_amount  4675 non-null   float64       
 39  sku                            4675 non-null   object        
 40  taxful_total_price             4675 non-null   float64       
 41  taxless_total_price            4675 non-null   float64       
 42  total_quantity                 4675 non-null   int64         
 43  total_unique_products          4675 non-null   int64         
 44  type                           4675 non-null   object        
 45  user                           4675 non-null   object        
dtypes: datetime64[ns](3), float64(12), int64(5), object(26)
memory usage: 64.0 bytes
# getting the dtypes from pandas dataframe:
df_ecommerce.dtypes
category                  object
currency                  object
customer_first_name       object
customer_full_name        object
customer_gender           object
customer_id                int64
customer_last_name        object
customer_phone            object
day_of_week               object
day_of_week_i              int64
email                     object
manufacturer              object
order_date                object
order_id                   int64
products                  object
sku                       object
taxful_total_price       float64
taxless_total_price      float64
total_quantity             int64
total_unique_products      int64
type                      object
user                      object
geoip                     object
event                     object
dtype: object
# retrieving the Data types for the index normally would require us to perform the following Elasticsearch query:
mapping = es.indices.get_mapping(index_name) 
# which by itself is an abstraction of the GET request for mapping retrieval
json(mapping)
{
  "kibana_sample_data_ecommerce": {
    "mappings": {
      "properties": {
        "category": {
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          },
          "type": "text"
        },
        "currency": {
          "type": "keyword"
        },
        "customer_birth_date": {
          "type": "date"
        },
        "customer_first_name": {
          "fields": {
            "keyword": {
              "ignore_above": 256,
              "type": "keyword"
            }
          },
          "type": "text"
        },
        "customer_full_name": {
          "fields": {
            "keyword": {
              "ignore_above": 256,
              "type": "keyword"
            }
          },
          "type": "text"
        },
        "customer_gender": {
          "type": "keyword"
        },
        "customer_id": {
          "type": "keyword"
        },
        "customer_last_name": {
          "fields": {
            "keyword": {
              "ignore_above": 256,
              "type": "keyword"
            }
          },
          "type": "text"
        },
        "customer_phone": {
          "type": "keyword"
        },
        "day_of_week": {
          "type": "keyword"
        },
        "day_of_week_i": {
          "type": "integer"
        },
        "email": {
          "type": "keyword"
        },
        "event": {
          "properties": {
            "dataset": {
              "type": "keyword"
            }
          }
        },
        "geoip": {
          "properties": {
            "city_name": {
              "type": "keyword"
            },
            "continent_name": {
              "type": "keyword"
            },
            "country_iso_code": {
              "type": "keyword"
            },
            "location": {
              "type": "geo_point"
            },
            "region_name": {
              "type": "keyword"
            }
          }
        },
        "manufacturer": {
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          },
          "type": "text"
        },
        "order_date": {
          "type": "date"
        },
        "order_id": {
          "type": "keyword"
        },
        "products": {
          "properties": {
            "_id": {
              "fields": {
                "keyword": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              },
              "type": "text"
            },
            "base_price": {
              "type": "half_float"
            },
            "base_unit_price": {
              "type": "half_float"
            },
            "category": {
              "fields": {
                "keyword": {
                  "type": "keyword"
                }
              },
              "type": "text"
            },
            "created_on": {
              "type": "date"
            },
            "discount_amount": {
              "type": "half_float"
            },
            "discount_percentage": {
              "type": "half_float"
            },
            "manufacturer": {
              "fields": {
                "keyword": {
                  "type": "keyword"
                }
              },
              "type": "text"
            },
            "min_price": {
              "type": "half_float"
            },
            "price": {
              "type": "half_float"
            },
            "product_id": {
              "type": "long"
            },
            "product_name": {
              "analyzer": "english",
              "fields": {
                "keyword": {
                  "type": "keyword"
                }
              },
              "type": "text"
            },
            "quantity": {
              "type": "integer"
            },
            "sku": {
              "type": "keyword"
            },
            "tax_amount": {
              "type": "half_float"
            },
            "taxful_price": {
              "type": "half_float"
            },
            "taxless_price": {
              "type": "half_float"
            },
            "unit_discount_amount": {
              "type": "half_float"
            }
          }
        },
        "sku": {
          "type": "keyword"
        },
        "taxful_total_price": {
          "type": "half_float"
        },
        "taxless_total_price": {
          "type": "half_float"
        },
        "total_quantity": {
          "type": "integer"
        },
        "total_unique_products": {
          "type": "integer"
        },
        "type": {
          "type": "keyword"
        },
        "user": {
          "type": "keyword"
        }
      }
    }
  }
}
# Eland abstracts this procedure into the same pandas api:
ed_ecommerce.dtypes
category                                 object
currency                                 object
customer_birth_date              datetime64[ns]
customer_first_name                      object
customer_full_name                       object
customer_gender                          object
customer_id                              object
customer_last_name                       object
customer_phone                           object
day_of_week                              object
day_of_week_i                             int64
email                                    object
event.dataset                            object
geoip.city_name                          object
geoip.continent_name                     object
geoip.country_iso_code                   object
geoip.location                           object
geoip.region_name                        object
manufacturer                             object
order_date                       datetime64[ns]
order_id                                 object
products._id                             object
products.base_price                     float64
products.base_unit_price                float64
products.category                        object
products.created_on              datetime64[ns]
products.discount_amount                float64
products.discount_percentage            float64
products.manufacturer                    object
products.min_price                      float64
products.price                          float64
products.product_id                       int64
products.product_name                    object
products.quantity                         int64
products.sku                             object
products.tax_amount                     float64
products.taxful_price                   float64
products.taxless_price                  float64
products.unit_discount_amount           float64
sku                                      object
taxful_total_price                      float64
taxless_total_price                     float64
total_quantity                            int64
total_unique_products                     int64
type                                     object
user                                     object
dtype: object
# defining the full-text query we need: Retrieving records for either Elitelligence or Primemaster manufacturer
query = {
        "query_string" : {
            "fields" : ["manufacturer"],
            "query" : "Elitelligence OR Primemaster"
        }
    }
# using full-text search capabilities with Eland:
text_search_df = ed_ecommerce.es_query(query)

# visualizing price of products for each manufacturer using pandas column syntax:
text_search_df[['manufacturer','products.price']]
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
manufacturer products.price
Fy_X0nMBxx5L21Ced97s [Elitelligence, Oceanavigations] [11.99, 24.99]
Gi_X0nMBxx5L21Ced97t [Primemaster, Oceanavigations] [74.99, 99.99]
Gy_X0nMBxx5L21Ced97t [Elitelligence] [59.99, 20.99]
HS_X0nMBxx5L21Ced97t [Low Tide Media, Elitelligence] [20.99, 24.99]
Hi_X0nMBxx5L21Ced97t [Low Tide Media, Oceanavigations, Elitelligence] [28.99, 41.99, 59.99, 7.99]
... ... ...
SC_X0nMBxx5L21CejPB3 [Elitelligence] [59.99, 18.99]
Sy_X0nMBxx5L21CejPB3 [Low Tide Media, Elitelligence] [24.99, 41.99]
Ty_X0nMBxx5L21CejPB3 [Elitelligence] [24.99, 25.99]
US_X0nMBxx5L21CejPB3 [Angeldale, Elitelligence] [79.99, 10.99]
VS_X0nMBxx5L21CejPB3 [Elitelligence, Low Tide Media] [7.99, 59.99]

1435 rows × 2 columns

About

Elasticsearch for Data Science just got way easier

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published