Replies: 11 comments 3 replies
-
Beta Was this translation helpful? Give feedback.
-
Beta Was this translation helpful? Give feedback.
-
进阶作业: 已搭建面向机器人领域的专业问答助手,但未在OpenXLab部署。
分别执行如下代码以构建助手。
构建机器人论文数据集: download_paper.py
import requests
import re
from tqdm import tqdm
import os
from bs4 import BeautifulSoup
# Download the GitHub page that lists the papers.
url = 'https://github.com/GT-RIPL/Awesome-LLM-Robotics/blob/main/README.md'  # replace with your own GitHub file link
# A timeout keeps the script from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
markdown_text = response.text
# Collect every arXiv abstract link found in the page text.
# dict.fromkeys drops repeated links while keeping first-seen order, so each
# paper is requested (and counted) only once.
urls = list(dict.fromkeys(
    re.findall(r'https://arxiv\.org/abs/[0-9]+\.[0-9]+', markdown_text)
))
def get_arxiv_paper_title(paper_id, timeout=30):
    """Fetch the arXiv abstract page of a paper and return its title text.

    Args:
        paper_id: arXiv identifier that is appended to
            ``https://arxiv.org/abs/``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The stripped text of the first ``<title>``/heading tag that either is
        a ``<title>`` element or carries a ``title`` CSS class, or ``None``
        when the page does not answer with HTTP 200 or no such tag exists.

    Network errors and timeouts raised by ``requests`` propagate to the
    caller.
    """
    base_url = f'https://arxiv.org/abs/{paper_id}'
    response = requests.get(base_url, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    possible_titles = soup.find_all(['title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    for tag in possible_titles:
        # Accept the <title> element itself or any heading tagged class="title".
        if 'title' in tag.name.lower() or 'title' in tag.get('class', []):
            return tag.get_text().strip()
    return None
def download_arxiv_paper(paper_id, output_folder, timeout=30):
    """Download the PDF of an arXiv paper into ``output_folder``.

    The file is named after the paper title returned by
    ``get_arxiv_paper_title``, with every character other than word
    characters, ``-``, ``_``, ``.`` and space replaced by ``_``.

    Args:
        paper_id: arXiv identifier, used to build the abstract and PDF URLs.
        output_folder: Directory that receives the PDF. It must already exist.
        timeout: Seconds to wait for the PDF server before giving up.

    Returns:
        None. The outcome (downloaded, skipped, failed, title not found) is
        reported with ``print``.

    Network errors and timeouts raised by ``requests`` and file-system errors
    propagate to the caller.
    """
    paper_title = get_arxiv_paper_title(paper_id)
    if not paper_title:
        print(f"Paper {paper_id} title not found.")
        return

    # Replace characters that are unsafe in file names. A raw string is used
    # so the backslashes reach the regex engine without invalid-escape warnings.
    paper_title = re.sub(r'[^\w\-_\. ]', '_', paper_title)
    download_url = f'https://arxiv.org/pdf/{paper_id}.pdf'
    # os.path.join also works when output_folder has no trailing separator.
    output_path = os.path.join(output_folder, f"{paper_title.strip()}.pdf")

    # Skip papers that were already downloaded.
    if os.path.exists(output_path):
        print(f"File '{output_path}' already exists. Skipping download.")
        return

    # The context manager releases the streamed connection in every case.
    with requests.get(download_url, stream=True, allow_redirects=True,
                      timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download paper {paper_id}.")
            return

        # Write to a temporary name first: an interrupted download must not
        # leave a truncated PDF that the "already exists" check would then
        # skip forever.
        partial_path = output_path + ".part"
        try:
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024):
                    file.write(chunk)
            os.replace(partial_path, output_path)
        finally:
            # Only present here when the download or rename did not complete.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    print(f"Paper {paper_id} downloaded successfully as '{output_path}'!")
output_folder = "./paper/"
# exist_ok makes this a no-op when the folder is already there.
os.makedirs(output_folder, exist_ok=True)
success_num = 0
# Download the papers.
for url in tqdm(urls):
    # The arXiv ID is everything after the last '/'. Slicing a fixed number of
    # characters would pick up the '/' for shorter IDs such as 1409.1234.
    paper_id = url.rsplit('/', 1)[-1]
    print(paper_id)
    try:
        download_arxiv_paper(paper_id, output_folder)
        # download_arxiv_paper reports failed or skipped downloads by printing,
        # so this counts the calls that finished without raising.
        success_num += 1
    except Exception as err:
        # Keep going with the next paper, but show what went wrong.
        # (KeyboardInterrupt is deliberately not caught so Ctrl-C still works.)
        print("download err:", err)
print("success_num:", success_num)
创建本地知识库: create_db.py
# 首先导入所需第三方库
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from tqdm import tqdm
import os
# Collect the paths of the PDF files to index.
def get_pdf_files(dir_path):
    """Return the paths of all files ending in ``.pdf`` below ``dir_path``.

    Args:
        dir_path: Root folder to search; ``os.walk`` descends into every
            sub-folder.

    Returns:
        A list of paths, each built by joining the folder reported by
        ``os.walk`` with the file name. The suffix check is case-sensitive.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(dir_path)
        for name in names
        if name.endswith(".pdf")
    ]
# Load the collected files into document objects.
def get_text(dir_path):
    """Load every file returned by ``get_pdf_files(dir_path)``.

    Args:
        dir_path: Root folder that is searched for files.

    Returns:
        A list with the documents produced by the loader of each file, in the
        order the files were found. Files whose extension has no loader are
        skipped.
    """
    # Loader class to use for each supported file extension.
    loader_by_suffix = {
        'md': UnstructuredMarkdownLoader,
        'txt': UnstructuredFileLoader,
        'pdf': PyPDFLoader,
    }

    docs = []
    for one_file in tqdm(get_pdf_files(dir_path)):
        # The extension is the text after the last '.' in the path.
        loader_cls = loader_by_suffix.get(one_file.split('.')[-1])
        if loader_cls is None:
            # Unsupported file type: skip it.
            continue
        docs.extend(loader_cls(one_file).load())
    return docs
# Folders whose files are indexed.
tar_dir = ["/root/data/ALLMR_demo/paper"]

# Load the documents of every target folder.
docs = []
for dir_path in tar_dir:
    docs += get_text(dir_path)

# Split the documents into overlapping chunks (chunk_size=500, chunk_overlap=150).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
split_docs = text_splitter.split_documents(docs)

# Load the open-source sentence-embedding model from its local path.
embeddings = HuggingFaceEmbeddings(
    model_name="/root/data/model/sentence-transformer"
)

# Build the vector database.
# Directory in which Chroma keeps the database on disk.
persist_directory = 'data_base/vector_db/chroma'
vectordb = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
    persist_directory=persist_directory,
)
# 将加载的向量数据库持久化到磁盘上
vectordb.persist()
用于运行助手的 demo.py 文件和大模型定义文件 LLM.py 与教程一致,不再赘述 |
Beta Was this translation helpful? Give feedback.
-
第3节作业:https://github.com/GitEasonXu/InternLM/blob/main/homework/course_3/coursework.md |
Beta Was this translation helpful? Give feedback.
-
https://blog.csdn.net/preor/article/details/135514158?spm=1001.2014.3001.5501 |
Beta Was this translation helpful? Give feedback.
-
https://github.com/ATang0729/InternLMCamp_Notes/blob/main/notes/class3-HW/HW_class3.md |
Beta Was this translation helpful? Give feedback.
-
https://github.com/DreamBinary/InternLMCamp/tree/master/day03/work |
Beta Was this translation helpful? Give feedback.
-
https://blog.csdn.net/m0_75085274/article/details/135511954?spm=1001.2014.3001.5502 |
Beta Was this translation helpful? Give feedback.
-
https://github.com/8baby8/Farewell_Learn_puyu/blob/main/%E7%AC%AC%E4%B8%89%E8%8A%82%E8%AF%BE/task.md |
Beta Was this translation helpful? Give feedback.
-
Beta Was this translation helpful? Give feedback.
-
Beta Was this translation helpful? Give feedback.
Uh oh!
There was an error while loading. Please reload this page.
-
这里存放你的第三节课作业~
Beta Was this translation helpful? Give feedback.
All reactions