|
5 | 5 |
|
6 | 6 | ## Busca vetorial
|
7 | 7 |
|
8 |
| -... |
| 8 | +### text embedding |
| 9 | +```http |
| 10 | +
|
| 11 | +POST _ml/trained_models/sentence-transformers__msmarco-minilm-l-12-v3/_infer |
| 12 | +{ |
| 13 | + "docs": [ |
| 14 | + { |
| 15 | + "text_field": "eu vou no meetup de elastic em blumenau." |
| 16 | + } |
| 17 | + ] |
| 18 | +} |
| 19 | +
|
| 20 | +``` |
| 21 | + |
| 22 | +### spark-shell para gerar um dataset qualquer |
| 23 | +- https://www.rtancman.com.br/elasticsearch/elasticsearch-spark-ingestao-de-dados.html |
| 24 | +- https://www.kaggle.com/datasets/brunoalarcon123/top-200-spotify-songs-dataset |
| 25 | + |
| 26 | +```bash |
| 27 | +spark-shell |
| 28 | +``` |
| 29 | + |
| 30 | +```scala |
| 31 | +import org.apache.spark.sql.functions._ |
| 32 | +// Load the semicolon-delimited Spotify CSV; the first row holds the column names. |
| 33 | +val df = spark.read.option("header",true).options(Map("delimiter"->";")).csv("/Users/rtancman/Downloads/Spotify_Dataset_V3.csv") |
| 34 | +// Build a single "text" column; concat_ws skips null columns, whereas concat would null out the whole value. |
| 35 | +val df2 = df.select(concat_ws(" ", df("Artists"), df("Title"), df("Nationality")).as("text")) |
| 36 | +df2.show() |
| 37 | +// Export the texts without a header row; Spark writes a directory of part files at this path. |
| 38 | +df2.write.option("header","false").csv("/Users/rtancman/Downloads/elastic-bnu-texts.csv") |
| 39 | +``` |
| 40 | + |
| 41 | +### ingestion pipeline |
| 42 | + |
| 43 | +```http |
| 44 | +
|
| 45 | +PUT _ingest/pipeline/text-embeddings |
| 46 | +{ |
| 47 | + "description": "Text embedding pipeline", |
| 48 | + "processors": [ |
| 49 | + { |
| 50 | + "inference": { |
| 51 | + "model_id": "sentence-transformers__msmarco-minilm-l-12-v3", |
| 52 | + "target_field": "text_embedding", |
| 53 | + "field_map": { |
| 54 | + "text": "text_field" |
| 55 | + } |
| 56 | + } |
| 57 | + } |
| 58 | + ], |
| 59 | + "on_failure": [ |
| 60 | + { |
| 61 | + "set": { |
| 62 | + "description": "Index document to 'failed-<index>'", |
| 63 | + "field": "_index", |
| 64 | + "value": "failed-{{{_index}}}" |
| 65 | + } |
| 66 | + }, |
| 67 | + { |
| 68 | + "set": { |
| 69 | + "description": "Set error message", |
| 70 | + "field": "ingest.failure", |
| 71 | + "value": "{{_ingest.on_failure_message}}" |
| 72 | + } |
| 73 | + } |
| 74 | + ] |
| 75 | +} |
| 76 | +
|
| 77 | +PUT elastic-bnu |
| 78 | +{ |
| 79 | + "mappings": { |
| 80 | + "properties": { |
| 81 | + "text": { |
| 82 | + "type": "text" |
| 83 | + } |
| 84 | + } |
| 85 | + } |
| 86 | +} |
| 87 | +
|
| 88 | +PUT elastic-bnu-with-embeddings |
| 89 | +{ |
| 90 | + "mappings": { |
| 91 | + "properties": { |
| 92 | + "text_embedding.predicted_value": { |
| 93 | + "type": "dense_vector", |
| 94 | + "dims": 384, |
| 95 | + "index": true, |
| 96 | + "similarity": "cosine" |
| 97 | + }, |
| 98 | + "text": { |
| 99 | + "type": "text" |
| 100 | + } |
| 101 | + } |
| 102 | + } |
| 103 | +} |
| 104 | +
|
| 105 | +POST /elastic-bnu/_bulk |
| 106 | +{"index":{}} |
| 107 | +{"text":"Lana Del Rey Summertime Sadness United States"} |
| 108 | +{"index":{}} |
| 109 | +{"text":"Eliza Rose, Interplanetary Criminal B.O.T.A. (Baddest Of Them All) - Edit United Kingdom"} |
| 110 | +{"index":{}} |
| 111 | +{"text":"Eliza Rose, Interplanetary Criminal B.O.T.A. (Baddest Of Them All) - Edit United Kingdom"} |
| 112 | +{"index":{}} |
| 113 | +{"text":"Lost Frequencies, Calum Scott Where Are You Now Belgium"} |
| 114 | +{"index":{}} |
| 115 | +{"text":"Lost Frequencies, Calum Scott Where Are You Now United Kingdom"} |
| 116 | +{"index":{}} |
| 117 | +{"text":"Paulo Londra, Feid A Veces (feat. Feid) Argentina"} |
| 118 | +{"index":{}} |
| 119 | +{"text":"Paulo Londra, Feid A Veces (feat. Feid) Colombia"} |
| 120 | +{"index":{}} |
| 121 | +{"text":"Jhayco, Feid, Sech En La De Ella Puerto Rico"} |
| 122 | +{"index":{}} |
| 123 | +{"text":"Jhayco, Feid, Sech En La De Ella Colombia"} |
| 124 | +{"index":{}} |
| 125 | +{"text":"Jhayco, Feid, Sech En La De Ella Panama"} |
| 126 | +{"index":{}} |
| 127 | +{"text":"Maroon 5, Wiz Khalifa Payphone United States"} |
| 128 | +{"index":{}} |
| 129 | +{"text":"Maroon 5, Wiz Khalifa Payphone United States"} |
| 130 | +{"index":{}} |
| 131 | +{"text":"Lewis Capaldi Forget Me United Kingdom"} |
| 132 | +{"index":{}} |
| 133 | +{"text":"Måneskin THE LONELIEST Italy"} |
| 134 | +{"index":{}} |
| 135 | +{"text":"Olivia Rodrigo drivers license United States"} |
| 136 | +{"index":{}} |
| 137 | +{"text":"OneRepublic Counting Stars United States"} |
| 138 | +{"index":{}} |
| 139 | +{"text":"Olivia Rodrigo traitor United States"} |
| 140 | +{"index":{}} |
| 141 | +{"text":"Burna Boy Alone Nigeria"} |
| 142 | +{"index":{}} |
| 143 | +{"text":"(G)I-DLE Nxde South Korea"} |
| 144 | +{"index":{}} |
| 145 | +{"text":"Vance Joy Riptide Australia"} |
| 146 | +{"index":{}} |
| 147 | +{"text":"BTS Run BTS South Korea"} |
| 148 | +{"index":{}} |
| 149 | +{"text":"Sam Smith, Kim Petras Unholy (feat. Kim Petras) United Kingdom"} |
| 150 | +{"index":{}} |
| 151 | +{"text":"Sam Smith, Kim Petras Unholy (feat. Kim Petras) Germany"} |
| 152 | +{"index":{}} |
| 153 | +{"text":"Taylor Swift Anti-Hero United States"} |
| 154 | +{"index":{}} |
| 155 | +{"text":"Drake, 21 Savage Rich Flex Canada"} |
| 156 | +{"index":{}} |
| 157 | +{"text":"Drake, 21 Savage Rich Flex United Kingdom"} |
| 158 | +{"index":{}} |
| 159 | +{"text":"Manuel Turizo La Bachata Colombia"} |
| 160 | +{"index":{}} |
| 161 | +{"text":"David Guetta, Bebe Rexha I'm Good (Blue) France"} |
| 162 | +{"index":{}} |
| 163 | +{"text":"David Guetta, Bebe Rexha I'm Good (Blue) United States"} |
| 164 | +{"index":{}} |
| 165 | +{"text":"Harry Styles As It Was United Kingdom"} |
| 166 | +{"index":{}} |
| 167 | +{"text":"Bizarrap, Quevedo Quevedo: Bzrp Music Sessions, Vol. 52 Argentina"} |
| 168 | +{"index":{}} |
| 169 | +{"text":"Bizarrap, Quevedo Quevedo: Bzrp Music Sessions, Vol. 52 Spain"} |
| 170 | +{"index":{}} |
| 171 | +{"text":"Chris Brown Under The Influence United States"} |
| 172 | +{"index":{}} |
| 173 | +{"text":"Taylor Swift Midnight Rain United States"} |
| 174 | +{"index":{}} |
| 175 | +{"text":"Taylor Swift Lavender Haze United States"} |
| 176 | +{"index":{}} |
| 177 | +{"text":"Bad Bunny, Chencho Corleone Me Porto Bonito Puerto Rico"} |
| 178 | +{"index":{}} |
| 179 | +{"text":"Bad Bunny, Chencho Corleone Me Porto Bonito Puerto Rico"} |
| 180 | +{"index":{}} |
| 181 | +{"text":"Tom Odell Another Love United Kingdom"} |
| 182 | +{"index":{}} |
| 183 | +{"text":"Oliver Tree, Robin Schulz Miss You United States"} |
| 184 | +{"index":{}} |
| 185 | +{"text":"Oliver Tree, Robin Schulz Miss You Germany"} |
| 186 | +{"index":{}} |
| 187 | +{"text":"OneRepublic I Ain't Worried United States"} |
| 188 | +{"index":{}} |
| 189 | +{"text":"Ozuna, Feid Hey Mor Puerto Rico"} |
| 190 | +{"index":{}} |
| 191 | +{"text":"Ozuna, Feid Hey Mor Colombia"} |
| 192 | +{"index":{}} |
| 193 | +{"text":"Bad Bunny Tití Me Preguntó Puerto Rico"} |
| 194 | +{"index":{}} |
| 195 | +{"text":"Steve Lacy Bad Habit United States"} |
| 196 | +{"index":{}} |
| 197 | +{"text":"Meghan Trainor Made You Look United States"} |
| 198 | +{"index":{}} |
| 199 | +{"text":"Rihanna Lift Me Up - From Black Panther: Wakanda Forever - Music From and Inspired By Barbados"} |
| 200 | +{"index":{}} |
| 201 | +{"text":"Taylor Swift, Lana Del Rey Snow On The Beach (feat. Lana Del Rey) United States"} |
| 202 | +{"index":{}} |
| 203 | +{"text":"Taylor Swift, Lana Del Rey Snow On The Beach (feat. Lana Del Rey) United States"} |
| 204 | +{"index":{}} |
| 205 | +{"text":"Taylor Swift You're On Your Own, Kid United States"} |
| 206 | +{"index":{}} |
| 207 | +{"text":"Taylor Swift Maroon United States"} |
| 208 | +{"index":{}} |
| 209 | +{"text":"Bad Bunny Efecto Puerto Rico"} |
| 210 | +{"index":{}} |
| 211 | +{"text":"Joji Die For You Jamaica"} |
| 212 | +{"index":{}} |
| 213 | +{"text":"ROSALÍA DESPECHÁ Spain"} |
| 214 | +{"index":{}} |
| 215 | +{"text":"Taylor Swift Karma United States"} |
| 216 | +{"index":{}} |
| 217 | +{"text":"Joji Glimpse of Us Jamaica"} |
| 218 | +{"index":{}} |
| 219 | +{"text":"Stephen Sanchez Until I Found You United States"} |
| 220 | +{"index":{}} |
| 221 | +{"text":"d4vd Romantic Homicide United States"} |
| 222 | +{"index":{}} |
| 223 | +{"text":"Taylor Swift Bejeweled United States"} |
| 224 | +{"index":{}} |
| 225 | +{"text":"The Weeknd Die For You Canada"} |
| 226 | +{"index":{}} |
| 227 | +{"text":"Arctic Monkeys I Wanna Be Yours United Kingdom"} |
| 228 | +{"index":{}} |
| 229 | +{"text":"Rema, Selena Gomez Calm Down (with Selena Gomez) Nigeria"} |
| 230 | +{"index":{}} |
| 231 | +{"text":"Rema, Selena Gomez Calm Down (with Selena Gomez) United States"} |
| 232 | +{"index":{}} |
| 233 | +{"text":"Beyoncé CUFF IT United States"} |
| 234 | +{"index":{}} |
| 235 | +{"text":"Bad Bunny, Bomba Estéreo Ojitos Lindos Puerto Rico"} |
| 236 | +{"index":{}} |
| 237 | +{"text":"Bad Bunny, Bomba Estéreo Ojitos Lindos Colombia"} |
| 238 | +{"index":{}} |
| 239 | +{"text":"The Neighbourhood Sweater Weather United States"} |
| 240 | +
|
| 241 | +POST _reindex?wait_for_completion=false |
| 242 | +{ |
| 243 | + "source": { |
| 244 | + "index": "elastic-bnu" |
| 245 | + }, |
| 246 | + "dest": { |
| 247 | + "index": "elastic-bnu-with-embeddings", |
| 248 | + "pipeline": "text-embeddings" |
| 249 | + } |
| 250 | +} |
| 251 | +
|
| 252 | +GET _tasks/<task_id> |
| 253 | +
|
| 254 | +GET elastic-bnu-with-embeddings/_search |
| 255 | +{ |
| 256 | + "knn": { |
| 257 | + "field": "text_embedding.predicted_value", |
| 258 | + "query_vector_builder": { |
| 259 | + "text_embedding": { |
| 260 | + "model_id": "sentence-transformers__msmarco-minilm-l-12-v3", |
| 261 | + "model_text": "Ojitos Lindos Colombia" |
| 262 | + } |
| 263 | + }, |
| 264 | + "k": 10, |
| 265 | + "num_candidates": 100 |
| 266 | + }, |
| 267 | + "_source": [ |
| 268 | + "id", |
| 269 | + "text" |
| 270 | + ] |
| 271 | +} |
| 272 | +
|
| 273 | +``` |
| 274 | + |
| 275 | +### Hybrid search with RRF |
| 276 | +- https://www.elastic.co/blog/whats-new-elastic-enterprise-search-8-9-0 |
| 277 | +- https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html |
| 278 | + |
| 279 | +```http |
| 280 | +GET elastic-bnu-with-embeddings/_search |
| 281 | +{ |
| 282 | + "query": { |
| 283 | + "match": { |
| 284 | + "text": "Lindos" |
| 285 | + } |
| 286 | + }, |
| 287 | + "knn": { |
| 288 | + "field": "text_embedding.predicted_value", |
| 289 | + "query_vector_builder": { |
| 290 | + "text_embedding": { |
| 291 | + "model_id": "sentence-transformers__msmarco-minilm-l-12-v3", |
| 292 | + "model_text": "Ojitos Lindos Colombia" |
| 293 | + } |
| 294 | + }, |
| 295 | + "k": 10, |
| 296 | + "num_candidates": 100 |
| 297 | + }, |
| 298 | + "rank": { |
| 299 | + "rrf": { |
| 300 | + "window_size": 50, |
| 301 | + "rank_constant": 20 |
| 302 | + } |
| 303 | + }, |
| 304 | + "_source": [ |
| 305 | + "id", |
| 306 | + "text" |
| 307 | + ] |
| 308 | +} |
| 309 | +``` |
0 commit comments