Implement RAG with vLLM API
Is it possible to implement RAG with vLLM's API and our model deployed on the serverless endpoint?
3 Replies
Not within vLLM itself, I think, but it is possible with other tools.
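For example (just a sketch, not something I've run against your endpoint): with the plain `openai` client pointed at the endpoint's OpenAI-compatible `/openai/v1` route, RAG is basically "retrieve, then stuff the chunks into the prompt". Here `retrieve_chunks` is a placeholder for whatever vector search you use, and the endpoint ID and model name are just examples:

```python
from openai import OpenAI

# RunPod serverless vLLM endpoints expose an OpenAI-compatible API under /openai/v1
client = OpenAI(
    api_key="<YOUR_RUNPOD_API_KEY>",
    base_url="https://api.runpod.ai/v2/<ENDPOINT_ID>/openai/v1",
)

def retrieve_chunks(question: str) -> list[str]:
    """Placeholder: return the top-k text chunks for the question from your vector store."""
    raise NotImplementedError

def rag_answer(question: str) -> str:
    # 1) retrieve context, 2) stuff it into the prompt, 3) call the vLLM-served model
    context = "\n\n".join(retrieve_chunks(question))
    resp = client.chat.completions.create(
        model="openchat/openchat-3.5-1210",  # your deployed model
        messages=[
            {"role": "system", "content": "Answer using only the provided context."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"},
        ],
        temperature=0.0,
    )
    return resp.choices[0].message.content
```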
Possible with LangChain?
Or any tool you recommend?
```python
import os

from langchain_huggingface import HuggingFaceEmbeddings  # open-source embedding model
from langchain_openai import ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA

# Environment variables (replace with your keys)
os.environ['RUNPOD_API_KEY'] = '<YOUR_RUNPOD_API_KEY>'
os.environ['PINECONE_API_KEY'] = '<YOUR_PINECONE_API_KEY>'

# Embedding model; must match the one used to build the Pinecone index
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Connect to the existing Pinecone index (reads PINECONE_API_KEY from the environment)
index_name = "chatbot-v1"
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

# Point ChatOpenAI at the RunPod serverless vLLM endpoint (OpenAI-compatible API);
# the RunPod API key is sent as the bearer token
llm = ChatOpenAI(
    openai_api_key=os.environ['RUNPOD_API_KEY'],
    openai_api_base="https://api.runpod.ai/v2/vllm-cf5z42rtrzdc2o/openai/v1",
    model_name='openchat/openchat-3.5-1210',  # use your model's name
    temperature=0.0,
)

# Retrieval QA chain: fetch relevant chunks from Pinecone, then "stuff" them into the prompt
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)

# Example query
query = "Who was Benito Mussolini?"
response = qa.invoke({"query": query})

# Print the answer
print(response["result"])
```
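The snippet above assumes the `chatbot-v1` index already exists in Pinecone and is populated (with 384-dimensional vectors, since that's what `all-MiniLM-L6-v2` produces). If you still need to load documents, a rough sketch with the same embedding model could look like this; the file path and chunk sizes are just placeholders:

```python
from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load and chunk the source documents (path and chunk sizes are placeholders)
docs = TextLoader("history_notes.txt").load()
chunks = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50).split_documents(docs)

# Embed the chunks and upsert them into the existing "chatbot-v1" index
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
PineconeVectorStore.from_documents(chunks, embedding=embeddings, index_name="chatbot-v1")
```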
Yes
I think LangChain will make it easier.