Replit help Python

Code:

# Import libraries
import json
from PIL import Image 
import pytesseract
import spacy

# Document parser class
class DocParser:
    """Turn scanned documents into text, named entities, and JSON.

    OCR is delegated to pytesseract; entity recognition is delegated to a
    spaCy pipeline that is loaded once per instance in ``__init__``.
    """

    def __init__(self):
        """Load the spaCy ``en_core_web_lg`` pipeline used for entities."""
        self.nlp = spacy.load("en_core_web_lg")

    def process_image(self, image_file):
        """Run OCR on an image and return the recognised text.

        Args:
            image_file: Whatever ``PIL.Image.open`` accepts for the image
                to read.

        Returns:
            The string produced by ``pytesseract.image_to_string``.
        """
        # The context manager releases the underlying file handle as soon
        # as OCR is finished instead of leaving it to garbage collection.
        with Image.open(image_file) as image:
            return pytesseract.image_to_string(image)

    def extract_text(self, doc):
        """Return the plain text of *doc*.

        Args:
            doc: Either a string, which is returned unchanged, or an
                object exposing a ``text`` attribute.

        Returns:
            The text to feed to the NLP pipeline or to serialise.
        """
        return doc if isinstance(doc, str) else doc.text

    def extract_entities(self, doc):
        """Extract named entities from *doc*.

        Args:
            doc: A string or an object exposing a ``text`` attribute.

        Returns:
            A list of ``(entity_text, entity_label)`` tuples, in the order
            the pipeline reports them.
        """
        parsed = self.nlp(self.extract_text(doc))
        return [(ent.text, ent.label_) for ent in parsed.ents]

    def to_json(self, doc):
        """Serialise the text of *doc* and its entities as a JSON string.

        Args:
            doc: A string or an object exposing a ``text`` attribute.

        Returns:
            A JSON object with the keys ``"document_text"`` and
            ``"entities"``; each entity tuple is encoded as a JSON array.
        """
        text = self.extract_text(doc)
        entities = self.extract_entities(text)
        return json.dumps({
            "document_text": text,
            "entities": entities,
        })


# Build the parser once; its constructor loads the spaCy model.
parser = DocParser()

# NOTE(review): the path ends in .pdf but is handed to the image OCR
# routine -- confirm the file is in a format PIL can actually open.
pdf_doc = "sample.pdf"

# OCR the file first, then serialise the recognised text and its entities.
text = parser.process_image(image_file=pdf_doc)
json_data = parser.to_json(doc=text)
print(json_data)

Hi @PaulApostolakis!
What do you need help with?

2 Likes