Text2Vec
Text2Vec
- Sentence Transformer
- CoSENT hfl/chinese-macbert-base
- CoSENT hfl/chinese-lert-large
- GanymedeNil/text2vec-large-chinese
In [ ]:
from sentence_transformers import SentenceTransformer
all-mpnet-base-v2
In [ ]:
SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
Out[ ]:
In [ ]:
%%bash
ls -lah ~/.cache/torch/sentence_transformers/sentence-transformers_all-mpnet-base-v2
In [ ]:
%%bash
cat ~/.cache/torch/sentence_transformers/sentence-transformers_all-mpnet-base-v2/modules.json
In [ ]:
%%bash
cat ~/.cache/torch/sentence_transformers/sentence-transformers_all-mpnet-base-v2/sentence_bert_config.json
In [ ]:
%%bash
cat ~/.cache/torch/sentence_transformers/sentence-transformers_all-mpnet-base-v2/1_Pooling/config.json
In [ ]:
%%bash
du -sh ~/.cache/torch/sentence_transformers/sentence-transformers_all-mpnet-base-v2/data_config.json
all-MiniLM-L6-v2
In [ ]:
SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
Out[ ]:
In [ ]:
%%bash
ls -lah ~/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2
CoSENT hfl/chinese-macbert-base
Full Model Architecture:
CoSENT(
(0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_mean_tokens': True})
)
In [ ]:
SentenceTransformer('shibing624/text2vec-base-chinese')
Out[ ]:
In [ ]:
%%writefile ~/.cache/torch/sentence_transformers/shibing624_text2vec-base-chinese/modules.json
[
{
"idx": 0,
"name": "0",
"path": "",
"type": "sentence_transformers.models.Transformer"
},
{
"idx": 1,
"name": "1",
"path": "1_Pooling",
"type": "sentence_transformers.models.Pooling"
}
]
In [ ]:
%%writefile ~/.cache/torch/sentence_transformers/shibing624_text2vec-base-chinese/sentence_bert_config.json
{
"max_seq_length": 128,
"do_lower_case": false
}
In [ ]:
%%bash
mkdir -p ~/.cache/torch/sentence_transformers/shibing624_text2vec-base-chinese/1_Pooling
In [ ]:
%%writefile ~/.cache/torch/sentence_transformers/shibing624_text2vec-base-chinese/1_Pooling/config.json
{
"word_embedding_dimension": 768,
"pooling_mode_mean_tokens": true
}
In [ ]:
%%bash
ls -lah ~/.cache/torch/sentence_transformers/shibing624_text2vec-base-chinese
CoSENT hfl/chinese-lert-large
In [ ]:
SentenceTransformer('GanymedeNil/text2vec-large-chinese')
Out[ ]:
In [ ]:
%%writefile ~/.cache/torch/sentence_transformers/GanymedeNil_text2vec-large-chinese/modules.json
[
{
"idx": 0,
"name": "0",
"path": "",
"type": "sentence_transformers.models.Transformer"
},
{
"idx": 1,
"name": "1",
"path": "1_Pooling",
"type": "sentence_transformers.models.Pooling"
}
]
In [ ]:
%%writefile ~/.cache/torch/sentence_transformers/GanymedeNil_text2vec-large-chinese/sentence_bert_config.json
{
"max_seq_length": 128,
"do_lower_case": false
}
In [ ]:
%%bash
mkdir -p ~/.cache/torch/sentence_transformers/GanymedeNil_text2vec-large-chinese/1_Pooling
In [ ]:
%%writefile ~/.cache/torch/sentence_transformers/GanymedeNil_text2vec-large-chinese/1_Pooling/config.json
{
"word_embedding_dimension": 1024,
"pooling_mode_mean_tokens": true
}
In [ ]:
%%bash
ls -lah ~/.cache/torch/sentence_transformers/GanymedeNil_text2vec-large-chinese
In [ ]:
%%bash
du -sh ~/.cache/torch/sentence_transformers/GanymedeNil_text2vec-large-chinese
Transformer and Indexer
In [ ]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('GanymedeNil/text2vec-large-chinese')
In [ ]:
import faiss
indexer = faiss.IndexFlatL2(1024)
Preparing Text
In [ ]:
from langchain.document_loaders import DirectoryLoader, BSHTMLLoader
loader = DirectoryLoader('_morning/htm', loader_cls=BSHTMLLoader)
raw_documents = loader.load()
In [ ]:
import re
# Normalize the loaded HTML text: insert a newline after each Chinese full
# stop (。) so every sentence ends a line, then collapse runs of newlines
# into one.  This prepares the text for per-sentence embedding below.
for raw_document in raw_documents:
    raw_document.page_content = re.sub(r'\n+', '\n', raw_document.page_content.replace('。', '。\n'))
Embedding
In [ ]:
sentences = None
In [ ]:
num_words = 20
In [ ]:
def embed(text):
    '''Split *text* into sentences and return their embeddings.

    Sentences are produced by inserting a newline after each Chinese full
    stop (。) and splitting on newline runs; blank results are dropped.

    Bug fix: the original reset the global ``sentences`` list on every call,
    while the faiss indexer keeps accumulating vectors — so after a second
    call, indices stored in the indexer pointed at the wrong (or missing)
    sentences.  New sentences are now APPENDED, keeping list positions
    aligned with indexer ids, and only the new sentences are encoded.
    '''
    global sentences
    import re
    if sentences is None:
        sentences = []
    new_sentences = [
        s.strip()
        for s in re.split(r'\n+', text.replace('。', '。\n'))
        if s.strip()
    ]
    sentences.extend(new_sentences)
    print('\nNumber of Sentences:', len(new_sentences))
    print('\nEmbedding the sentences...')
    # Encode only the newly added sentences; the caller adds the returned
    # vectors to the indexer, so ids continue from the previous total.
    return model.encode(new_sentences)
In [ ]:
embeddings = embed(''.join([raw_document.page_content for raw_document in raw_documents]))
In [ ]:
len(embeddings[0])
Out[ ]:
Loading Faiss Indexer from Disk
In [ ]:
import os, pickle
In [ ]:
# Reuse a previously built faiss index if one was saved to disk.
# NOTE(review): unpickling is only safe because this file is written locally
# by the save cell below — never load a pickle from an untrusted source.
# NOTE(review): the pickled index is restored but the matching ``sentences``
# list is not, so ids loaded here may not resolve to text — confirm intent.
if os.path.exists(os.path.expanduser('~/lert_indexing_morning.pkl')):
    # load vectorstore
    with open(os.path.expanduser('~/lert_indexing_morning.pkl'), 'rb') as f:
        indexer = pickle.load(f)
Indexing
In [ ]:
print('\nBuilding the index...')
indexer.add(embeddings)
print('\nindex.ntotal:', indexer.ntotal)
Saving Faiss Indexer to Disk
In [ ]:
if not os.path.exists(os.path.expanduser('~/lert_indexing_morning.pkl')):
# save vectorstore
with open(os.path.expanduser('~/lert_indexing_morning.pkl'), 'wb') as f:
pickle.dump(indexer, f)
In [ ]:
%%bash
ls -lah ~/lert_indexing_morning.pkl
Searching
In [ ]:
k = 20
In [ ]:
def retrieve(text):
    '''Encode *text*, search the faiss index for the ``k`` nearest sentence
    embeddings, and return the matching sentences joined by newlines.

    Bug fixes vs. the original:
    - faiss pads results with id ``-1`` when fewer than ``k`` vectors exist;
      the original's ``sentences[i]`` silently returned the LAST sentence for
      those.  Negative ids are now skipped.
    - the bare ``except:`` is replaced by an explicit range check, so only
      genuinely out-of-sync ids are reported (same diagnostic print).
    '''
    print('\nSearching for:', text)
    xq = model.encode([text])
    D, I = indexer.search(xq, k)
    print('Distance:', D[0])
    print('Index:', I[0])
    print('\nRetrieving related information...')
    result = ''
    for i in I[0]:
        if 0 <= i < len(sentences):
            result += sentences[i] + '\n'
        else:
            # Index does not resolve to a known sentence (padding id, or the
            # indexer and sentences list are out of sync) — report and skip.
            print(len(sentences), i)
    print('\nresult:', result)
    return result
In [ ]:
retrieve('你知道什么?')
retrieve('第八周讲了什么?')
retrieve('七倍加强的灵是什么?')
Out[ ]:
In [ ]:
retrieve('七倍加强的灵是什么?')
retrieve('问题:七倍加强的灵是什么?')
retrieve('用户提供了一段文本片段,但没有明确说明文档的主题。\n问题:七倍加强的灵是什么?')
Out[ ]:
Chat with PDF
In [ ]:
# Seed conversation for the English summarizer call: one system prompt asking
# the model to condense the chat history.  Fixes the "setences" typo in the
# prompt text.
messages_in_english = [{
    'role': 'system', 'content': 'You are an AI agent that summarizes chat in less than three sentences.'
}]
In [ ]:
# Chinese counterpart of the summarizer seed prompt.
# NOTE(review): the role '系统' is not a role the official OpenAI chat API
# accepts ('system'/'user'/'assistant'); this presumably only works because
# the call is mocked via ipymock below — confirm before using the real API.
messages_in_chinese = [{
    'role': '系统', 'content': '你是一个 AI 代理。请用中文在三句话之内概括聊天内容。'
}]
In [ ]:
# System prompt for the English question-answering chat: instructs the model
# to answer from the retrieved context, avoid fabricated hyperlinks, and
# reply in Markdown.  Falls back to prior knowledge when the context lacks
# the answer.
chats_in_english = [{
    'role': 'system', 'content': 'You are an AI assistant providing helpful advice.\n' + \
    'You are given the following extracted parts of a long document and a question.\n' + \
    'Provide a conversational answer based on the context provided.\n' + \
    'You should only provide hyperlinks that reference the context below.\n' + \
    'Do NOT make up hyperlinks.\n' + \
    'If you can\'t find the answer in the context below, use your prior knowledge,\n' + \
    'but in most of the cases the answer will be in the context.\n' + \
    # 'If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.\n' + \
    'Answer in Markdown format.\n'
}]
In [ ]:
# Chinese system prompt for the question-answering chat (mirror of the
# English version above: answer from retrieved context, no fabricated links,
# Markdown output).
# NOTE(review): the role '系统' is not an official OpenAI chat role; this
# presumably relies on the ipymock mocking used below — confirm before
# hitting the real API.
chats_in_chinese = [{
    'role': '系统', 'content': '你是一个提供有用建议的 AI 助手。\n' + \
    '你被提供了一份长文档的一部分(额外信息)和一个问题。\n' + \
    '请根据我所提供的文本提供会话式的回答。\n' + \
    '你只应该提供与下面的文本相关的超链接。\n' + \
    '**不要**编造超链接。\n' + \
    '如果在下面的文本中找不到答案,可以使用你先前所知道的知识,\n' + \
    '但在大多数情况下,答案是在文本中的。\n' + \
    # '如果问题与上下文不相关,请礼貌地回复您只回答与上下文相关的问题。\n' + \
    '请用中文以 Markdown 格式回答。\n'
}]
In [ ]:
import PyPDF2
def extract_text(pdf_file):
    '''Return the full text of a PDF file, with pages separated by a blank line.'''
    with open(pdf_file.name, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        page_texts = [page.extract_text() for page in reader.pages]
    return '\n\n'.join(page_texts)
In [ ]:
def build_the_bot(pdf_file, openai_key=None):
    '''Extract the text of *pdf_file*, embed its sentences (Chinese sentence
    splitting happens inside ``embed``), add the vectors to the global faiss
    indexer, and return the extracted text for display in the UI.
    '''
    print('OpenAI Key:', openai_key)
    pdf_content = extract_text(pdf_file)
    # Bug fix: the original printed len(extra_text), an undefined name that
    # raised NameError on every call.
    print('\nText Length:', len(pdf_content))
    print('\nBuilding the index...')
    indexer.add(embed(pdf_content))
    print('\nindex.ntotal:', indexer.ntotal)
    return pdf_content
In [ ]:
import openai
In [ ]:
def chat(chat_history, user_input):
    '''Answer *user_input* in Chinese using retrieved document context.

    Flow: (1) ask the model to summarize the running conversation, (2) use
    summary + question as the retrieval query against the faiss index,
    (3) send system prompt + retrieved context + question for the answer.
    Yields the updated gradio chat history.

    Bug fix vs. the original: the "Summarized Histoy" typo in the log line.
    '''
    global sentences
    print('\nmessages_in_chinese:', messages_in_chinese)
    print('\nSummarizing the chat history...')
    # First call: condense the conversation so the retrieval query carries
    # conversational context without blowing the prompt budget.
    completion = openai.ChatCompletion.create(
        model = 'gpt-3.5-turbo',
        temperature = 0,
        messages = messages_in_chinese
    )
    summary = completion.choices[0].message.content
    print(f'\nSummarized History: {summary}')
    extra_info = retrieve(summary + '\n\n' + '问题:' + user_input)
    chats_in_chinese.append({'role': '用户', 'content': '额外信息:\n' + extra_info + '\n\n' + '问题:' + user_input})
    print('\nchats_in_chinese:', chats_in_chinese)
    # Second call: send only the system prompt plus the latest user turn,
    # keeping the request small regardless of chat length.
    completion = openai.ChatCompletion.create(
        model = 'gpt-3.5-turbo',
        temperature = 0,
        messages = chats_in_chinese[:1] + chats_in_chinese[-1:]
    )
    chat_output = completion.choices[0].message.content
    print(f'\nChatGPT: {chat_output}')
    yield chat_history + [(user_input, chat_output)]
In [ ]:
import gradio
In [ ]:
def test_demo(mock_openai):
    '''Launch the "Chat with PDF" gradio UI and block until it is closed.

    *mock_openai* is the ipymock fixture that patches the openai client; it
    is not used directly here, but requesting it makes ipymock apply the
    mock before this test body runs.
    '''
    with gradio.Blocks() as demo:
        gradio.Markdown('Chat with a PDF document')
        # Tab 1: pick a PDF, optionally an API key, and index its content.
        with gradio.Tab('Select PDF'):
            pdf = gradio.File()
            openai_key = gradio.Textbox(label='OpenAI API Key',)
            text_output = gradio.Textbox(label='PDF content')
            text_button = gradio.Button('Build the Bot!!!')
            text_button.click(build_the_bot, [pdf, openai_key], text_output)
        # Tab 2: chat against the indexed document.
        with gradio.Tab('Knowledge Bot'):
            chatbot = gradio.Chatbot()
            message = gradio.Textbox('What is this document about?')
            message.submit(chat, [chatbot, message], chatbot)
        # debug=True blocks here until the server is stopped.
        demo.queue().launch(debug = True)
    assert True
    demo.close()
In [ ]:
from ipymock import do
from ipymock.browser import common, get_conversation, mock_openai
In [ ]:
common.conversation_id = ''
In [ ]:
do(
mock_openai=mock_openai,
test_demo=test_demo,
)
In [ ]:
from langchain.embeddings import HuggingFaceEmbeddings
HuggingFaceEmbeddings(model_name='GanymedeNil/text2vec-large-chinese')
Out[ ]: