LangChain Embeddings
Open Text Embeddings
In [ ]:
%%bash
pip install --upgrade llama-cpp-python
In [ ]:
%%bash
pip install --upgrade git+https://github.com/huggingface/transformers
In [ ]:
%%bash
pip install tokenizers==0.13.3 protobuf==3.20.*
In [ ]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Download the tokenizer and model weights into the local Hugging Face cache.
# gpt4-x-alpaca is a LLaMA-based causal LM, so load it with the causal-LM head.
tokenizer = AutoTokenizer.from_pretrained('chavinlo/gpt4-x-alpaca')
model = AutoModelForCausalLM.from_pretrained('chavinlo/gpt4-x-alpaca', resume_download=True)
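Since the point of this cell is just to populate the local cache, huggingface_hub's snapshot_download (used again below for the ggml repo) can fetch the same files without instantiating the model in memory:
In [ ]:
from huggingface_hub import snapshot_download
# Download every file in the repo to the local HF cache, resuming if interrupted.
snapshot_download(repo_id='chavinlo/gpt4-x-alpaca', resume_download=True)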
In [ ]:
%%bash
ls -lah ~/.cache/huggingface/hub/models--chavinlo--gpt4-x-alpaca/snapshots/6a571f458cab9a23d14324ec63e0abd1744c8353
In [ ]:
%%bash
ls -lah ~/.cache/huggingface/hub/models--chavinlo--gpt4-x-alpaca/blobs
In [ ]:
import os
from langchain.embeddings import LlamaCppEmbeddings
alpaca_embeddings = LlamaCppEmbeddings(model_path=os.path.expanduser('~/.cache/huggingface/hub/models--chavinlo--gpt4-x-alpaca/snapshots/6a571f458cab9a23d14324ec63e0abd1744c8353/model.bin'))
In [ ]:
# Prepare the text.
text = '这是一个测试文档。'
# Generate text embeddings with LlamaCppEmbeddings.
query_result = alpaca_embeddings.embed_query(text)
doc_result = alpaca_embeddings.embed_documents([text])
print(len(query_result))
# print(query_result)
print(len(doc_result))
print(len(doc_result[0]))
# print(doc_result)
In [ ]:
from huggingface_hub import hf_hub_download
hf_hub_download(repo_id='Pi3141/gpt4-x-alpaca-native-13B-ggml', filename='consolidated.00.pth', resume_download=True)
Out[ ]:
In [ ]:
# Use Python to strip the last byte from the partially-downloaded file.
import os
with open(os.path.expanduser('~/.cache/huggingface/hub/models--Pi3141--gpt4-x-alpaca-native-13B-ggml/blobs/8d308284e190467111257950d4e8b34b1e3f19a70636fa6ea51dfa62f4cf5b55.incomplete'), 'rb+') as filehandle:
    filehandle.seek(-1, os.SEEK_END)
    filehandle.truncate()
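To confirm that exactly one byte was dropped, the blob size can be checked before and after the truncation (a minimal sketch; blob_path is just a local convenience name):
In [ ]:
import os
# Report the size of the partially-downloaded blob in bytes.
blob_path = os.path.expanduser('~/.cache/huggingface/hub/models--Pi3141--gpt4-x-alpaca-native-13B-ggml/blobs/8d308284e190467111257950d4e8b34b1e3f19a70636fa6ea51dfa62f4cf5b55.incomplete')
print(os.path.getsize(blob_path))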
In [ ]:
from huggingface_hub import hf_hub_download
hf_hub_download(repo_id='Pi3141/gpt4-x-alpaca-native-13B-ggml', filename='ggml-model-q4_1.bin', resume_download=True)
Out[ ]:
In [ ]:
from huggingface_hub import snapshot_download
snapshot_download(repo_id='Pi3141/gpt4-x-alpaca-native-13B-ggml', resume_download=True)
Out[ ]:
In [ ]:
%%bash
ls -lah ~/.cache/huggingface/hub/models--Pi3141--gpt4-x-alpaca-native-13B-ggml/snapshots/43cce6aab1b95712d83165afafa3c7baad140eb9
In [ ]:
%%bash
ls -lah ~/.cache/huggingface/hub/models--Pi3141--gpt4-x-alpaca-native-13B-ggml/blobs
In [ ]:
import os
from langchain.embeddings import LlamaCppEmbeddings
alpaca_embeddings = LlamaCppEmbeddings(model_path=os.path.expanduser('~/ggml-model-q4_1.bin'))
In [ ]:
# Prepare the text.
text = '这是一个测试文档。'
# Generate text embeddings with LlamaCppEmbeddings.
query_result = alpaca_embeddings.embed_query(text)
doc_result = alpaca_embeddings.embed_documents([text])
print(len(query_result))
# print(query_result)
print(len(doc_result))
print(len(doc_result[0]))
# print(doc_result)
In [ ]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
In [ ]:
def get_docs(dir_name):
    # (1) Import a series of documents.
    loader = DirectoryLoader(dir_name, loader_cls=TextLoader, silent_errors=True)
    raw_documents = loader.load()
    # (2) Split them into small chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=64,
    )
    return text_splitter.split_documents(raw_documents)
In [ ]:
len(get_docs('_posts/ultimate-facts'))
Out[ ]:
In [ ]:
get_docs('_posts/ultimate-facts')[0]
Out[ ]:
In [ ]:
get_docs('_posts/ultimate-facts')[1]
Out[ ]:
In [ ]:
get_docs('_posts/ultimate-facts')[2]
Out[ ]:
In [ ]:
get_docs('_posts/ultimate-facts')[3]
Out[ ]:
In [ ]:
import os
from langchain.embeddings import LlamaCppEmbeddings
from langchain.vectorstores.faiss import FAISS
In [ ]:
def ingest_docs(dir_name):
    documents = get_docs(dir_name)
    # (3) Create embeddings for each chunk (using a local llama.cpp model).
    embeddings = LlamaCppEmbeddings(model_path=os.path.expanduser('~/ggml-model-q4_1.bin'), n_ctx=2048)
    return FAISS.from_documents(documents, embeddings)

vectorstore = ingest_docs('_posts/ultimate-facts')
In [ ]:
import pickle
In [ ]:
# Save the vectorstore.
with open('vectorstore_13B_2048.pkl', 'wb') as f:
    pickle.dump(vectorstore, f)
In [ ]:
# Load the vectorstore.
with open('vectorstore_13B_2048.pkl', 'rb') as f:
    vectorstore = pickle.load(f)
In [ ]:
question = '你知道什么?'
In [ ]:
# Retrieve the contexts most similar to the question from the vectorstore.
for context in vectorstore.similarity_search(question):
    print(f'{context}\n')
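To see how close each hit actually is, the FAISS wrapper also exposes similarity_search_with_score, which returns (document, distance) pairs; a minimal sketch, assuming the installed LangChain version provides it:
In [ ]:
# Print the distance alongside a preview of each matching chunk.
for context, score in vectorstore.similarity_search_with_score(question):
    print(score, context.page_content[:80])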
Open Text Embeddings
In [ ]:
%%bash
pip install --upgrade sentence-transformers
In [ ]:
from langchain.embeddings import HuggingFaceEmbeddings
In [ ]:
help(HuggingFaceEmbeddings)
In [ ]:
help(HuggingFaceEmbeddings.__init__)
In [ ]:
HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
In [ ]:
hf_embeddings = HuggingFaceEmbeddings()
# Prepare the text.
text = '这是一个测试文档。'
# Generate text embeddings with HuggingFaceEmbeddings.
query_result = hf_embeddings.embed_query(text)
doc_result = hf_embeddings.embed_documents([text])
print(len(query_result))
# print(query_result)
print(len(doc_result))
print(len(doc_result[0]))
# print(doc_result)
In [ ]:
hf_embeddings.model_name
Out[ ]:
In [ ]:
%%bash
ls -lah ~/.cache/torch/sentence_transformers/sentence-transformers_all-mpnet-base-v2
In [ ]:
%%bash
du -sh ~/.cache/torch/sentence_transformers/sentence-transformers_all-mpnet-base-v2
In [ ]:
%%bash
pip install --upgrade InstructorEmbedding
In [ ]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
hfi_embeddings = HuggingFaceInstructEmbeddings(model_name='hkunlp/instructor-large')
In [ ]:
hfi_embeddings.model_name
Out[ ]:
In [ ]:
%%bash
ls -lah ~/.cache/torch/sentence_transformers/hkunlp_instructor-large
In [ ]:
%%bash
du -sh ~/.cache/torch/sentence_transformers/hkunlp_instructor-large
In [ ]:
# Prepare the text.
text = '这是一个测试文档。'
# Generate text embeddings with HuggingFaceInstructEmbeddings.
query_result = hfi_embeddings.embed_query(text)
doc_result = hfi_embeddings.embed_documents([text])
print(len(query_result))
print(query_result)
print(len(doc_result))
print(len(doc_result[0]))
print(doc_result)
In [ ]:
from langchain.embeddings.huggingface import DEFAULT_QUERY_INSTRUCTION, DEFAULT_EMBED_INSTRUCTION
In [ ]:
print(DEFAULT_QUERY_INSTRUCTION)
print(DEFAULT_EMBED_INSTRUCTION)
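Both defaults are English instructions; for a Chinese corpus it may help to pass Chinese instructions instead. A minimal sketch, assuming this LangChain version exposes the embed_instruction and query_instruction parameters (the Chinese wording here is a hypothetical choice to tune for your corpus):
In [ ]:
# Override the default English instructions with Chinese ones.
hfi_embeddings_zh = HuggingFaceInstructEmbeddings(
    model_name='hkunlp/instructor-large',
    embed_instruction='表示用于检索的文档:',
    query_instruction='表示用于检索的查询:',
)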
Preparing Documents
In [ ]:
%%bash
wget --recursive --no-parent --accept=.html --directory-prefix _morning --no-clobber http://ailingmusheng.ren/7/2022djth/2022-7_0008.html
In [ ]:
import os
import shutil
# Recursively walk the directory tree, find subdirectories named .ipynb_checkpoints, and delete them.
def remove_checkpoints(dir_path):
    for root, dirs, files in os.walk(dir_path):
        for name in dirs:
            if name == '.ipynb_checkpoints':
                shutil.rmtree(os.path.join(root, name))

# Delete every .ipynb_checkpoints subdirectory under the given directory.
remove_checkpoints('_morning')
In [ ]:
import os
def delete_ds_store_files(path):
    for root, dirs, files in os.walk(path):
        for name in files:
            if name == '.DS_Store':
                os.remove(os.path.join(root, name))

delete_ds_store_files('_morning')
In [ ]:
import os
import shutil
from pathlib import Path
# 1. Set the directory path.
directory_path = '_morning'
# 2. Collect every file under the directory and its subdirectories, grouped by extension.
file_extension_map = {}
for root, dirs, files in os.walk(directory_path):
    for file_name in files:
        file_path = os.path.join(root, file_name)
        file_extension = Path(file_name).suffix.lower()
        if file_extension not in file_extension_map:
            file_extension_map[file_extension] = []
        file_extension_map[file_extension].append(file_path)
# 3. Create a new directory per extension and move the matching files into it.
for file_extension, file_list in file_extension_map.items():
    new_directory_path = os.path.join(directory_path, file_extension[1:])
    for file_path in file_list:
        new_file_path = os.path.join(new_directory_path, os.path.relpath(file_path, directory_path))
        os.makedirs(os.path.dirname(new_file_path), exist_ok=True)  # Create the directory if it does not exist.
        shutil.move(file_path, new_file_path)
In [ ]:
%%bash
pip install chardet
In [ ]:
import chardet
# Read the file contents.
with open('_morning/html/ailingmusheng.ren/7/2022djth/2022-7_0008.html', 'rb') as f:
    content = f.read()
# Detect the encoding of the file contents.
result = chardet.detect(content)
# Print the detected encoding and the confidence.
print('Encoding:', result['encoding'])
print('Confidence:', result['confidence'])
In [ ]:
import os
import shutil
def remove_xml_lines(path):
    for root, dirs, files in os.walk(path):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path) as f:
                lines = f.readlines()
            with open(file_path, 'w') as f:
                for line in lines:
                    if line != "<?xml version='1.0' encoding='utf-8'?>\n":
                        f.write(line)

remove_xml_lines('_morning/htm')
remove_xml_lines('_morning/html')
In [ ]:
from langchain.document_loaders import DirectoryLoader, BSHTMLLoader
loader = DirectoryLoader('_morning/htm', loader_cls=BSHTMLLoader)
raw_documents = loader.load()
In [ ]:
import re
# Insert a newline after each Chinese full stop, then collapse runs of newlines.
for raw_document in raw_documents:
    raw_document.page_content = re.sub(r'\n+', '\n', raw_document.page_content.replace('。', '。\n'))
In [ ]:
# from langchain.document_loaders import DirectoryLoader, BSHTMLLoader
# loader = DirectoryLoader('_morning/html', loader_cls=BSHTMLLoader)
# raw_documents = loader.load()
- https://github.com/hwchase17/langchain/blob/master/langchain/text_splitter.py => RecursiveCharacterTextSplitter
- https://github.com/hwchase17/langchain/blob/master/langchain/schema.py => Document
In [ ]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=64,
)
documents = text_splitter.split_documents(raw_documents)
In [ ]:
from langchain.docstore.document import Document
# import pdb; pdb.set_trace()
text_splitter.split_documents([Document(page_content='第四周历代志、以斯拉记、尼希米记、以斯帖记结晶读经第四周借着神的申言者神圣的鼓励,恢复神殿的建造周四、周五叁撒迦利亚书启示,灯台的七灯(四2,启四5)是神的七灵,七倍加强的灵(一4),就是耶和华的七眼(亚四10),也是救赎之羔羊的七眼(启五6),以及建造之石头的七眼(亚三9),为着三一神完满的彰显和神殿的重建:一在撒迦利亚三章九节里,这块安置在约书亚面前的石头,预表基督是神建造的石头(诗一一八22,太二一42);耶和华要雕刻这石头,指明基督在十字架上受死时,乃是被神雕刻、剪除;耶和华要在一日之间除掉那地的罪孽,指明神在其上作工的基督,要在一日之间,就是在祂钉十字架之日,除掉以色列地的罪;借着祂在十字架上的死,神的羔羊基督除去了世人的罪(彼前二24,约一29):1石头、耶和华和羔羊乃是一;基督是救赎的羔羊和建造的石头,也是耶和华;基督乃是羔羊石头—羔羊为着救赎,石头为着建造—启五6,亚三9。2在神的建造里,基督是基石,托住神的建造;是房角石,将祂身体上外邦和犹太的肢体联络在一起;也是恩典的顶石,完成神建造中的一切—赛二八16,林前三11,弗二20,彼前二6,亚四7。3神的羔羊基督是那有七眼之建造的石头,这启示基督的七眼乃是为着神的建造—约一29,亚三9,启五6。4基督是建造的石头,有七眼,就是七灵,为要将祂自己灌注到我们里面,好把我们变化为宝贵的材料,为着神的建造;当主注视我们,祂的七眼就将祂自己灌注到我们里面—亚三9,林前三12,启三1,五6。二为着完成神的建造,七倍加强的灵是基督这救赎之羔羊和建造之石头的眼睛,鉴察并搜寻我们,并用基督的素质、丰富和负担,注入并灌注到我们里面,为着神的建造—亚三9,四7,启一14,五6:1羔羊的七眼,将基督这法理的救赎者注入我们里面;石头的七眼,将基督这生机的拯救者注入我们里面,目的是为着神在地上经纶的行动,要借着祂法理的救赎,凭着祂生机的拯救,达到祂建造的目标—约一29,徒四11~12,罗五10。2在我们里面有两盏灯—神七倍加强的灵在我们的灵里(箴二十27,启四5,林前六17);我们要被变化,就必须在祷告中向主完全敞开,让主的灯同着七盏火灯搜寻我们魂里的每一个房间,照耀并光照我们内里的各部分,用生命供应各部分。3经历最大变化的人,乃是向主完全敞开的人;借着七倍加强的灵在寻求基督之信徒里的运行,他们就得着加强,成为得胜者,以建造基督的身体,终极完成新耶路撒冷。三基督这位末后的亚当,在复活里成了赐生命的灵(十五45下,约六63上,林后三6下),祂也是七倍加强的灵;这灵就是生命的灵(罗八2);因此,七灵的功用乃是将神圣的生命分赐到神的子民里面,为着建造神永远的居所新耶路撒冷。四七倍加强的灵乃是七盏火灯,焚烧、光照、暴露、搜寻、审判、洁净并炼净我们,好产生金灯台,完成神新约的经纶—启四5,一2、4、9~12、20。第四周周五晨兴喂养箴二十27人的灵是耶和华的灯,鉴察人的深处。启四5……有七盏火灯在宝座前点着,这七灯就是神的七灵。谁经历最大量的变化?就是向主完全敞开的人。……“主,我向你完全敞开。我要一直向你敞开。我的全人—我的心、我的心思、我的意志和我的情感—是敞开的。求你一直照耀,彻底鉴察我,光照并点活我。我愿完全接受你的光照。”这样,光会渗透每一部分,同时生命会供应给你。泥土所造的人要变化成为基督的形像。随着金这样成形在你里面,就会有七灵照耀并彰显神。愿我们众人向祂敞开,接受祂的光照,并让祂的生命供应我们。然后我们就会变化,并有基督的形像。我们蒙里面的灯光照,就会实际地在我们的地方上成为金灯台,彰显三一神。这样,祂就要得着祂的见证(李常受文集一九七九年第一册,五○七至五○八页)。信息选读这包罗万有、超绝、奇妙、奥秘、美妙的一位,乃是神行政的执行者。……因为祂有资格,因为祂配〔参启五4~6〕,所以七印交给了祂。这一位有资格揭开七印,执行神的经纶。祂执行神经纶的方式,乃是凭着七灵作祂的眼睛。……基督是神经纶的焦点执行者,但祂需要七灵作祂的眼睛,来执行神的经纶。今天七灵在地上焚烧,为着执行神的行政。……焚烧的火焰执行神的经纶,目的是要产生金灯台,众召会。焚烧含示审判、洁净、炼净、产生。……我不相信在世界或召会里似乎令人失望的光景。我相信焚烧之七灵的火焰,支配并指引世界,也审判、洁净并炼净召会,要产生一个纯金的灯台。我们在这里尽量给主机会和入口,来审判我们、洁净我们并炼净我们,好产生一个纯金的灯台。我们向着神七灵的焚烧大大敞开。我们都需要祷告:“亲爱的神圣火焰,来吧!来审判!来洁净!来炼净,使你能产生金灯台。”……因着祂的怜悯,我们向祂敞开。我们每天、每早、每晚都需要祷告:“主,来吧;我们向你敞开!我们全人的每一通道都向你敞开。”……我能作见证,我几乎天天祷告:“主,光照我;主,搜寻我里面,并且暴露我。我喜欢被你光照,并且在你的光中被暴露。”……我们都必须祷告:“主,我们是敞开的。来照耀在我们身上,从我们里面照耀,光照我们全人的每一通道、每一角落。我喜欢被暴露、被清理、被炼净。”这样,主就有路产生纯金的灯台。出自永远者和救赎者的七灵,乃是在神宝座前点着的七盏火灯,在宇宙中执行神的经纶;也是被杀之羔羊的七眼,搜寻并灌注众召会(四5,五6下)。七灵的双重使命乃是执行神的行政,以及搜寻并灌注众召会。七灵搜寻出我们的罪恶,并以基督的丰富灌注我们。当人和你说话的时候,他的两眼同时把他的负担灌注到你里面。照样,神的七灵作为羔羊的眼睛,也把这位奇妙者的负担和素质灌注到我们里面(李常受文集一九八四年第三册,四四八至四五二页)。参读:生命信息,第六十八至七十章;神新约的经纶,第二十三章。确定的话定住的光启示:七灵乃是在神宝座前点着的七盏火灯。经历:七灯来焚烧、光照、暴露、搜寻、审判、洁净并炼净我们。应用:借长时祷告向主完全敞开。一句话:“亲爱的神圣火焰,来吧!来审判!来洁净!来炼净!”', metadata={})])
In [ ]:
list('这是一个测试文档。')
Out[ ]:
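The list() call above shows that the Chinese sentence decomposes into single characters: there are no spaces for the splitter's default separators to use, so it falls back to character-level boundaries. Passing separators that include the Chinese full stop keeps chunks aligned with sentences; a minimal sketch using the separators parameter of RecursiveCharacterTextSplitter (zh_splitter is a hypothetical name):
In [ ]:
# Prefer paragraph, then line, then sentence ('。'), then character boundaries.
zh_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '。', ''],
    chunk_size=512,
    chunk_overlap=64,
)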
Embedding Documents
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
In [ ]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
In [ ]:
hfi_embeddings = HuggingFaceInstructEmbeddings(model_name='hkunlp/instructor-large')
tqdm is a Python library for adding progress bars to loops. It works with any iterable, such as lists, tuples, dictionaries, and file handles, and its simple API makes it easy to wrap a loop with a progress bar. A simple example:
In [ ]:
from tqdm.autonotebook import trange
for i in trange(1000000):
    pass
In [ ]:
from tqdm import tqdm
for i in tqdm(range(1000000)):
    pass
In this example, the tqdm library adds a progress bar to the loop. The built-in range() function generates an iterator of 1,000,000 elements, which is passed to tqdm(). A simple loop then iterates over it, and tqdm advances the progress bar automatically on each iteration; no manual tqdm.update() call is needed when wrapping an iterable.
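When the work cannot simply wrap an iterable, tqdm also supports driving the bar manually with update():
In [ ]:
from tqdm import tqdm
# Manually advance the progress bar once per iteration.
with tqdm(total=1000000) as pbar:
    for i in range(1000000):
        pbar.update(1)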
Loading Faiss Indexer from Disk
In [ ]:
import os, pickle
In [ ]:
if os.path.exists(os.path.expanduser('~/vectorstore_morning.pkl')):
    # Load the vectorstore.
    with open(os.path.expanduser('~/vectorstore_morning.pkl'), 'rb') as f:
        vectorstore = pickle.load(f)
Embedding and Indexing
from langchain.vectorstores.faiss import FAISS
vectorstore = FAISS.from_documents(documents, embeddings)
In [ ]:
from langchain.vectorstores.faiss import FAISS
In [ ]:
if not os.path.exists(os.path.expanduser('~/vectorstore_morning.pkl')):
    vectorstore = FAISS.from_documents(documents, hfi_embeddings)
Saving Faiss Indexer to Disk
In [ ]:
if not os.path.exists(os.path.expanduser('~/vectorstore_morning.pkl')):
    # Save the vectorstore.
    with open(os.path.expanduser('~/vectorstore_morning.pkl'), 'wb') as f:
        pickle.dump(vectorstore, f)
In [ ]:
%%bash
ls -lah ~/vectorstore_morning.pkl
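Pickling the whole vectorstore works, but it ties the file to the exact class layout of the installed LangChain version. The FAISS wrapper also offers save_local and load_local; a minimal sketch, assuming the installed version provides them:
In [ ]:
# Persist the index and docstore to a directory instead of one pickle file.
vectorstore.save_local('vectorstore_morning')
vectorstore = FAISS.load_local('vectorstore_morning', hfi_embeddings)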
Similarity Searching
In [ ]:
question = '你知道什么?'
In [ ]:
# Retrieve the contexts most similar to the question from the vectorstore.
vectorstore.similarity_search(question, 30)
Out[ ]:
In [ ]:
question = '第八周讲了什么?'
In [ ]:
# Retrieve the contexts most similar to the question from the vectorstore.
vectorstore.similarity_search(question, 30)
Out[ ]:
In [ ]:
question = '七倍加强的灵是什么?'
In [ ]:
# Retrieve the contexts most similar to the question from the vectorstore.
vectorstore.similarity_search(question, 30)
Out[ ]:
Mock OpenAI
In [ ]:
import json, os
from revChatGPT.V1 import Chatbot, configure
# Open the JSON file and read the conversation_id.
with open(os.path.expanduser('~/.config/revChatGPT/config.json'), 'r') as f:
    conversation_id = json.load(f).get('conversation_id', None)

bot = Chatbot(
    config = configure(),
    conversation_id = conversation_id,
    lazy_loading = True
)
In [ ]:
class attrdict(dict):
    def __getattr__(self, attr):
        return self.get(attr)

def attributize(obj):
    '''Add attribute access to a dictionary and its sub-dictionaries.'''
    if isinstance(obj, dict):
        for key in obj:
            obj[key] = attributize(obj[key])
        return attrdict(obj)
    if isinstance(obj, list):
        return [attributize(item) for item in obj]
    return obj

def delta(prompt):
    '''Yield streaming chunks in the shape of OpenAI chat-completion deltas.'''
    res = ''
    for response in bot.ask(prompt):
        yield attributize({
            'choices': [
                {
                    'index': 0,
                    'delta': {
                        'content': response['message'][len(res):],
                    }
                }
            ],
        })
        res = response['message']

def mock_create(*args, **kwargs):
    '''Flatten the chat messages into one prompt and relay it to revChatGPT.'''
    summarized_prompt = ''
    for message in kwargs['messages']:
        summarized_prompt += f"{message['role']}:\n\n{message['content']}\n\n\n"
    summarized_prompt = summarized_prompt.strip()
    if kwargs.get('stream', False):
        return delta(summarized_prompt)
    for response in bot.ask(summarized_prompt):
        pass
    return attributize({
        'choices': [
            {
                'finish_reason': 'stop',
                'index': 0,
                'message': {
                    'content': response['message'],
                    'role': 'assistant',
                }
            }
        ],
    })
In [ ]:
import openai, pytest
In [ ]:
@pytest.fixture
def mock_openai(monkeypatch):
    monkeypatch.setattr(openai.ChatCompletion, 'create', mock_create)
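A quick smoke test of the patched endpoint (a hypothetical check, relying on the fixture above and a logged-in revChatGPT session):
In [ ]:
def test_mock_create(mock_openai):
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[{'role': 'user', 'content': '你好'}],
    )
    # The mock mirrors the OpenAI response shape.
    assert response.choices[0].message.role == 'assistant'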
QA with Similarity Searching
In [ ]:
from langchain.prompts import PromptTemplate
CONDENSE_QUESTION_PROMPT = PromptTemplate(
    input_variables=['chat_history', 'question'],
    output_parser=None, partial_variables={},
    template='给定以下对话和后续问题,请重新表述后续问题以成为一个独立问题。\n\n聊天记录:\n{chat_history}\n后续问题:{question}\n独立问题:',
    template_format='f-string',
    validate_template=True
)
QA_PROMPT = PromptTemplate(
    input_variables=['context', 'question'],
    output_parser=None, partial_variables={},
    template='使用下面的背景信息回答最后的问题。如果您不知道答案,请直接说您不知道,不要试图编造一个答案。\n\n背景信息:\n{context}\n\n问题:{question}\n有用的答案:',
    template_format='f-string',
    validate_template=True
)
In [ ]:
from langchain.chains.llm import LLMChain
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores.base import VectorStore
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
# Callback function to stream answers to stdout.
manager = CallbackManager([StreamingStdOutCallbackHandler()])
streaming_llm = ChatOpenAI(streaming=True, callback_manager=manager, verbose=True, temperature=0)
question_gen_llm = ChatOpenAI(temperature=0, verbose=True, callback_manager=manager)
# Prompt to generate independent questions by incorporating chat history and a new question.
question_generator = LLMChain(llm=question_gen_llm, prompt=CONDENSE_QUESTION_PROMPT)
# Pass in documents and a standalone prompt to answer questions.
doc_chain = load_qa_chain(streaming_llm, chain_type='stuff', prompt=QA_PROMPT)
# Assemble the conversational retrieval chain over the vectorstore.
qa = ConversationalRetrievalChain(retriever=vectorstore.as_retriever(), combine_docs_chain=doc_chain, question_generator=question_generator)
In [ ]:
question = '七倍加强的灵是什么?'
In [ ]:
answer = {}
In [ ]:
def test_qa(mock_openai):
    global answer
    answer = qa({'question': question, 'chat_history': []})
    print('\n')
    assert isinstance(answer, dict)
In [ ]:
from ipymock import do
In [ ]:
do(
    mock_openai=mock_openai,
    test_qa=test_qa,
)
In [ ]:
answer
Out[ ]:
Instructor Transformer
In [ ]:
from InstructorEmbedding import INSTRUCTOR
model = INSTRUCTOR('hkunlp/instructor-large')
Faiss Indexer
In [ ]:
import faiss
index = faiss.IndexFlatL2(768)
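instructor-large produces 768-dimensional embeddings, which is why the flat L2 index is created with d=768. For cosine similarity instead of raw L2 distance, normalize the vectors and use an inner-product index; a minimal sketch (index_ip is a hypothetical name):
In [ ]:
import numpy as np
# Inner product over L2-normalized vectors is cosine similarity.
index_ip = faiss.IndexFlatIP(768)
vectors = model.encode([['表示用于检索的查询:', '这是一个测试文档。']]).astype(np.float32)
faiss.normalize_L2(vectors)
index_ip.add(vectors)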
Embedding and Indexing
In [ ]:
sentences = None
In [ ]:
num_words = 20
In [ ]:
instruction = 'Represent the query for retrieval:'
In [ ]:
instruction = '表示用于检索的查询:'
In [ ]:
def index_embeddings(text):
    global sentences
    # words = extra_text.split(' ')
    # sentences = [words[i: i+num_words] for i in range(0, len(words), num_words)]
    # sentences = [' '.join(word_list) for word_list in sentences]
    import re
    # Split on Chinese full stops and newlines, dropping blank sentences.
    sentences = []
    for sentence in re.split(r'\n+', text.replace('。', '。\n')):
        sentence = sentence.strip()
        if sentence != '':
            sentences.append(sentence)
    print('\nNumber of Sentences:', len(sentences))
    # print(sentences)
    print('\nBuilding the index...')
    embeddings = model.encode([[instruction, sentence] for sentence in sentences])
    index.add(embeddings)
    print('\nindex.ntotal:', index.ntotal)
In [ ]:
index_embeddings(''.join([raw_document.page_content for raw_document in raw_documents]))
Faiss Search
In [ ]:
k = 20
In [ ]:
def retrieve_extra_info(text):
    print('\nRetrieving extra information...')
    xq = model.encode([[instruction, text]])
    D, I = index.search(xq, k)
    print(D[0])
    print(I[0])
    extra_info = ''
    for i in I[0]:
        try:
            extra_info += sentences[i] + '\n'
        except IndexError:
            print(len(sentences), i)
    print('\nextra_info:', extra_info)
    return extra_info
In [ ]:
retrieve_extra_info('你知道什么?')
retrieve_extra_info('第八周讲了什么?')
retrieve_extra_info('七倍加强的灵是什么?')
Out[ ]:
Chat with PDF
In [ ]:
import logging
logging.getLogger().setLevel(logging.CRITICAL)
In [ ]:
messages_in_english = [{
    'role': 'system', 'content': 'You are an AI agent that summarizes chat in less than three sentences.'
}]
In [ ]:
messages_in_chinese = [{
    # The non-standard role name only works because openai.ChatCompletion.create is mocked above.
    'role': '系统', 'content': '你是一个 AI 代理。请用中文在三句话之内概括聊天内容。'
}]
In [ ]:
chats_in_english = [{
    'role': 'system',
    'content': 'You are an AI assistant providing helpful advice.\n'
        'You are given the following extracted parts of a long document and a question.\n'
        'Provide a conversational answer based on the context provided.\n'
        'You should only provide hyperlinks that reference the context below.\n'
        'Do NOT make up hyperlinks.\n'
        'If you can\'t find the answer in the context below, use your prior knowledge,\n'
        'but in most of the cases the answer will be in the context.\n'
        # 'If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.\n'
        'Answer in Markdown format.\n'
}]
In [ ]:
chats_in_chinese = [{
    'role': '系统',
    'content': '你是一个提供有用建议的 AI 助手。\n'
        '你被提供了一份长文档的一部分(额外信息)和一个问题。\n'
        '请根据我所提供的文本提供会话式的回答。\n'
        '你只应该提供与下面的文本相关的超链接。\n'
        '**不要**编造超链接。\n'
        '如果在下面的文本中找不到答案,可以使用你先前所知道的知识,\n'
        '但在大多数情况下,答案是在文本中的。\n'
        # '如果问题与上下文不相关,请礼貌地回复您只回答与上下文相关的问题。\n'
        '请用中文以 Markdown 格式回答。\n'
}]
In [ ]:
%%bash
pip install PyPDF2
In [ ]:
import PyPDF2
def extract_text(pdf_file):
    '''Extract the text from a PDF file.'''
    with open(pdf_file.name, 'rb') as f:
        return '\n\n'.join([page.extract_text() for page in PyPDF2.PdfReader(f).pages])
In [ ]:
def build_the_bot(pdf_file, openai_key=None):
    '''Extract the PDF text and index its sentence embeddings.'''
    openai.api_key = openai_key
    print('OpenAI Key:', openai_key)
    extra_text = extract_text(pdf_file)
    print('\nText Length:', len(extra_text))
    index_embeddings(extra_text)
    return extra_text
In [ ]:
import openai
In [ ]:
def chat(chat_history, user_input):
    '''Chat in Chinese.'''
    global sentences
    print('\nmessages_in_chinese:', messages_in_chinese)
    # messages_in_english.append({'role': 'user', 'content': 'Question:\n' + user_input})
    # print('\nmessages_in_english:', messages_in_english)
    print('\nSummarizing the chat history...')
    completion = openai.ChatCompletion.create(
        model = 'gpt-3.5-turbo',
        temperature = 0,
        messages = messages_in_chinese
    )
    summary = completion.choices[0].message.content
    print(f'\nSummarized History: {summary}')
    extra_info = retrieve_extra_info(summary + '\n\n' + '问题:' + user_input)
    chats_in_chinese.append({'role': '用户', 'content': '额外信息:\n' + extra_info + '\n\n' + '问题:' + user_input})
    print('\nchats_in_chinese:', chats_in_chinese)
    completion = openai.ChatCompletion.create(
        model = 'gpt-3.5-turbo',
        temperature = 0,
        messages = chats_in_chinese[:1] + chats_in_chinese[-1:]
    )
    chat_output = completion.choices[0].message.content
    print(f'\nChatGPT: {chat_output}')
    # messages_in_chinese.append({'role': '用户', 'content': user_input})
    # messages_in_chinese.append({'role': '助手', 'content': chat_output})
    yield chat_history + [(user_input, chat_output)]
In [ ]:
%%bash
pip install gradio
In [ ]:
import gradio
In [ ]:
def test_demo(mock_openai):
    with gradio.Blocks() as demo:
        gradio.Markdown('Chat with a PDF document')
        with gradio.Tab('Select PDF'):
            pdf = gradio.File()
            openai_key = gradio.Textbox(label='OpenAI API Key')
            text_output = gradio.Textbox(label='PDF content')
            text_button = gradio.Button('Build the Bot!!!')
            text_button.click(build_the_bot, [pdf, openai_key], text_output)
        with gradio.Tab('Knowledge Bot'):
            chatbot = gradio.Chatbot()
            message = gradio.Textbox('What is this document about?')
            message.submit(chat, [chatbot, message], chatbot)
    demo.queue().launch(debug = True)
    assert True
    demo.close()
In [ ]:
do(
    mock_openai=mock_openai,
    test_demo=test_demo,
)