<?xml version="1.0" encoding="utf-8" ?>
<!-- Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
<services version="1.0" xmlns:deploy="vespa" xmlns:preprocess="properties" minimum-required-vespa-version="8.327.49">

  <container id="default" version="1.0">

    <!-- Enables the document API on this container cluster, with default settings (no child configuration). -->
    <document-api/>

    <!-- Declares the secret "openai-api-key", read from vault "my-vault-name", entry "my-secret-name".
         The OpenAI client component refers to it by this element name through apiKeySecretName.
         NOTE(review): the vault and secret names look like placeholders; confirm and replace before deploying. -->
    <secrets>
      <openai-api-key vault="my-vault-name" name="my-secret-name"/>
    </secrets>

    <!-- OpenAI-backed LLM client, registered under the component id "openai".
         The API key is looked up through the secret named "openai-api-key". -->
    <component id="openai" class="ai.vespa.llm.clients.OpenAI">
      <config name="ai.vespa.llm.clients.llm-client">
        <apiKeySecretName>openai-api-key</apiKeySecretName>
      </config>
    </component>

    <!-- Set up local inference with a small Llama 3.2 model (the 1B instruct GGUF file referenced below). -->
    <!-- Comment out this component to avoid downloading the model file during startup. -->
    <!-- NOTE(review): the id "phi" does not match the Llama model it loads. The "local" search chain
         refers to this component through providerId "phi", so the two must be renamed together. -->
    <component id="phi" class="ai.vespa.llm.clients.LocalLLM">
      <config name="ai.vespa.llm.clients.llm-local-client">
        <model url="https://data.vespa-cloud.com/gguf_models/llama-3.2-1b-instruct-q8_0.gguf" />
        <!-- NOTE(review): presumably the context window in tokens and the number of concurrent
             requests; confirm against the llm-local-client config definition. -->
        <contextSize>4096</contextSize>
        <parallelRequests>1</parallelRequests>
      </config>
    </component>

    <!-- Search chains for retrieval-augmented generation. Both chains inherit from the "vespa" chain
         and add a RAGSearcher; they differ only in which LLM component the searcher is bound to. -->
    <search>

      <!-- Chain "openai": RAGSearcher bound to the component with id "openai". -->
      <chain id="openai" inherits="vespa">
        <searcher id="ai.vespa.search.llm.RAGSearcher">
          <config name="ai.vespa.search.llm.llm-searcher">
            <providerId>openai</providerId>
          </config>
        </searcher>
      </chain>

      <!-- Chain "local": RAGSearcher bound to the component with id "phi" (the LocalLLM component). -->
      <chain id="local" inherits="vespa">
        <searcher id="ai.vespa.search.llm.RAGSearcher">
          <config name="ai.vespa.search.llm.llm-searcher">
            <providerId>phi</providerId>
          </config>
        </searcher>
      </chain>

    </search>

    <!-- Node resources for this container cluster; the deploy:environment attribute scopes this element to "dev".
         Requests one x86_64 node with 4 vcpu, 16Gb memory and 125Gb local disk, plus one GPU with 16Gb memory.
         NOTE(review): the GPU is presumably intended for the LocalLLM component; confirm. -->
    <nodes count="1" deploy:environment="dev">
      <resources vcpu="4.0" memory="16Gb" architecture="x86_64" storage-type="local" disk="125Gb">
        <gpu count="1" memory="16.0Gb"/>
      </resources>
    </nodes>

  </container>

  <!-- Content cluster "msmarco": stores documents of type "passage" in index mode,
       with redundancy 1, on a single node (hostalias "node1", distribution key 0). -->
  <content id="msmarco" version="1.0">
    <redundancy>1</redundancy>
    <documents>
      <document mode="index" type="passage"/>
    </documents>
    <nodes>
      <node hostalias="node1" distribution-key="0" />
    </nodes>
  </content>

</services>