2. Document
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from dataclasses_json import DataClassJsonMixin
@dataclass
class BaseDocument(DataClassJsonMixin):
    """Generic container for a piece of text plus retrieval metadata.

    JSON (de)serialization is inherited from ``DataClassJsonMixin``.
    """

    # Raw text content of the document.
    text: Optional[str] = None
    # Unique identifier for the document; None until assigned.
    doc_id: Optional[str] = None
    # Vector embedding of ``text``; None until computed.
    embedding: Optional[List[float]] = None
    # Arbitrary extra metadata attached to the document.
    extra_info: Optional[Dict[str, Any]] = None
3. DataConnectors
GPT Index ではデータ読み込み機能を提供する Reader, Parser が定義されている
3.1. Wikipedia からのテキスト読み込み
# 一部省略 & 改変
import wikipedia
class WikipediaReader(BaseReader):
    """Load the plain-text content of Wikipedia pages as Documents."""

    def load_data(self, pages: List[str], **kwargs) -> List[Document]:
        """Fetch each page title in ``pages`` and wrap its content.

        Any extra keyword arguments are forwarded to ``wikipedia.page``.
        """
        return [
            Document(wikipedia.page(title, **kwargs).content)
            for title in pages
        ]
3.2. WEB からのテキスト読み込み
# 一部省略 & 改変
import requests
import html2text
class SimpleWebPageReader(BaseReader):
    """Load web pages over HTTP and return their bodies as Documents."""

    def __init__(self, html_to_text: bool = False):
        # When True, raw HTML responses are converted to plain text
        # with html2text before being wrapped in a Document.
        self._html_to_text = html_to_text

    def load_data(self, urls: List[str]) -> List[Document]:
        """Download every URL in ``urls`` and wrap each body in a Document."""

        def _fetch(url: str) -> str:
            # One GET per URL; optionally strip the HTML markup.
            body = requests.get(url).text
            return html2text.html2text(body) if self._html_to_text else body

        return [Document(_fetch(url)) for url in urls]
他にも Trafilatura, BeautifulSoup4 や feedparser を用いた RSS フィードの読み込みクラスが定義されている
3.3. 文書画像からのテキスト読み込み
# 一部省略 & 改変
from pathlib import Path
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel
class ImageParser(BaseParser):
    """Parse a document image into text with the Donut OCR-free model.

    Abridged excerpt: the base-class plumbing that stores the dict returned
    by ``_init_parser`` (and that defines ``device``) is not shown here.
    """

    def _init_parser(self):
        # Pretrained Donut checkpoint fine-tuned on the CORD-v2 receipt dataset.
        name = "naver-clova-ix/donut-base-finetuned-cord-v2"
        model = VisionEncoderDecoderModel.from_pretrained(name)
        processor = DonutProcessor.from_pretrained(name)
        return {
            "processor": processor,
            "model": model,
        }

    def parse_file(self, file: Path) -> str:
        # NOTE(review): `processor`, `model`, and `device` are not defined in
        # this excerpt — presumably unpacked from the `_init_parser` dict by
        # the omitted base-class code. Verify against the full source.
        image = Image.open(file).convert("RGB")
        # Donut task prompt selecting the CORD-v2 parsing task.
        task_prompt = "<s_cord-v2>"
        decoder_input_ids = processor.tokenizer(
            task_prompt,
            add_special_tokens=False,
            return_tensors="pt"
        ).input_ids
        pixel_values = processor(
            image,
            return_tensors="pt"
        ).pixel_values
        outputs = model.generate(
            pixel_values.to(device),
            decoder_input_ids=decoder_input_ids.to(device),
            # ...generation kwargs omitted in the excerpt...
        )
        sequence = processor.batch_decode(outputs.sequences)[0]
        # ...post-processing of the decoded sequence omitted in the excerpt...
        return sequence
3.4. スライドからのテキスト読み込み
python-pptx を用いてスライドを読み込み
transformers の nlpconnect/vit-gpt2-image-captioning を用いてキャプション生成
# 一部省略 & 改変
from pathlib import Path
from pptx import Presentation
from transformers import (
AutoTokenizer,
VisionEncoderDecoderModel,
ViTFeatureExtractor,
)
class PptxParser(BaseParser):
    """Parse a .pptx file into text, generating captions for embedded images.

    Abridged excerpt: ``caption_image``'s body and the code that writes each
    extracted image blob to the ``tmp_image.*`` file are omitted here.
    """

    def _init_parser(self):
        # ViT encoder + GPT-2 decoder image-captioning checkpoint.
        name = "nlpconnect/vit-gpt2-image-captioning"
        model = VisionEncoderDecoderModel.from_pretrained(name)
        feat_extractor = ViTFeatureExtractor.from_pretrained(name)
        tokenizer = AutoTokenizer.from_pretrained(name)
        return {
            "feature_extractor": feat_extractor,
            "model": model,
            "tokenizer": tokenizer
        }

    def caption_image(self, tmp_image_file: str) -> str:
        # ...captioning implementation omitted in the excerpt;
        # `caption` is produced by the elided code...
        return caption

    def parse_file(self, file: Path, urls: List[str]) -> str:
        # NOTE(review): `urls` is unused in the visible code — confirm whether
        # the full source uses it or whether the signature is vestigial.
        presentation = Presentation(file)
        result = ""
        for i, slide in enumerate(presentation.slides):
            # NOTE(review): "${i}" is JS-style interpolation; inside a Python
            # f-string it renders a literal "$" before the index — likely
            # intended to be "{i}". Confirm against the original source.
            result += f"\n\nSlide ${i}: \n"
            for shape in slide.shapes:
                # NOTE(review): only Picture shapes expose `.image`; other
                # shape types raise here. The omitted code presumably filters
                # shapes and writes `image.blob` to `image_file` before
                # captioning — verify in the full source.
                image = shape.image
                image_file = f"tmp_image.{image.ext}"
                caption = self.caption_image(image_file)
                result += f"\n Image: {caption}\n\n"
        return result
Comments