Implement RAG with vLLM API

Is it possible to implement RAG with the vLLM API and our model deployed on the serverless endpoint?
3 Replies
nerdylive · 3mo ago
Not within vLLM itself, I think, but it is possible with other tools.
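The serverless vLLM worker just exposes an OpenAI-compatible API, so the retrieval step has to live in your own code. A minimal sketch of the pattern using only the plain `openai` client against the endpoint; the `retrieve()` helper is a hypothetical stand-in for a real vector-store lookup, and the endpoint URL and model name are taken from the snippet below:

```python
from openai import OpenAI

# vLLM serverless endpoints speak the OpenAI protocol, so the stock client works
client = OpenAI(
    api_key="<YOUR_RUNPOD_API_KEY>",
    base_url="https://api.runpod.ai/v2/vllm-cf5z42rtrzdc2o/openai/v1",
)

def retrieve(query: str) -> list[str]:
    # Hypothetical stand-in: a real implementation would embed the query,
    # search a vector store (Pinecone, Chroma, ...), and return the top-k chunks
    return ["Benito Mussolini was the fascist dictator of Italy from 1922 to 1943."]

def rag_answer(query: str) -> str:
    # Stuff the retrieved chunks into the prompt and ask the deployed model
    context = "\n\n".join(retrieve(query))
    resp = client.chat.completions.create(
        model="openchat/openchat-3.5-1210",  # your deployed model
        messages=[
            {"role": "system", "content": "Answer using only the provided context."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
        ],
        temperature=0.0,
    )
    return resp.choices[0].message.content

print(rag_answer("Who was Benito Mussolini?"))
```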
Cyber | Senpai (OP) · 3mo ago
Possible with LangChain?
```python
import os

from langchain_community.embeddings import HuggingFaceEmbeddings  # wraps sentence-transformers models
from langchain_openai import ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA

# Environment variables (replace with your keys)
os.environ['RUNPOD_API_KEY'] = '<YOUR_RUNPOD_API_KEY>'
os.environ['PINECONE_API_KEY'] = '<YOUR_PINECONE_API_KEY>'

# Open-source embedding model, run locally; PineconeVectorStore expects a
# LangChain Embeddings object, not a bare sentence-transformers encode function
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Connect to the existing Pinecone index
index_name = "chatbot-v1"
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

# Point ChatOpenAI at the RunPod serverless vLLM endpoint, which speaks the
# OpenAI protocol, so no separate RunPod client is needed
llm = ChatOpenAI(
    api_key=os.environ['RUNPOD_API_KEY'],
    base_url="https://api.runpod.ai/v2/vllm-cf5z42rtrzdc2o/openai/v1",
    model='openchat/openchat-3.5-1210',  # Use your model's name
    temperature=0.0,
)

# Set up Retrieval QA chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)

# Example query
query = "Who was Benito Mussolini?"
response = qa.invoke({"query": query})

# Print the response
print(response["result"])
```
Or any tool you recommend?
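Note that the snippet above assumes the `chatbot-v1` index already holds your documents. If it does not, here is a sketch for populating it with the same embedding model; the two texts are placeholders for your own chunked corpus, and `PINECONE_API_KEY` must be set in the environment:

```python
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore

embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Placeholder documents; replace with your own chunked corpus
texts = [
    "Benito Mussolini was the fascist dictator of Italy from 1922 to 1943.",
    "He founded the National Fascist Party and allied Italy with Nazi Germany.",
]

# Embeds the texts and upserts them into the existing Pinecone index
PineconeVectorStore.from_texts(texts, embedding=embeddings, index_name="chatbot-v1")
```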
nerdylive · 3mo ago
Yes, I think LangChain will make it easier.
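One note: `RetrievalQA` is the older interface; newer LangChain versions favor composing the same pipeline with LCEL. A sketch that reuses the `llm` and `vectorstore` from your snippet above:

```python
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

prompt = ChatPromptTemplate.from_template(
    "Answer using only this context:\n\n{context}\n\nQuestion: {question}"
)

def format_docs(docs):
    # Concatenate the retrieved chunks into a single context string
    return "\n\n".join(doc.page_content for doc in docs)

# retriever -> prompt -> model -> plain string
chain = (
    {"context": vectorstore.as_retriever() | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print(chain.invoke("Who was Benito Mussolini?"))
```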