Extract data from CSV files, PDFs, and text documents
ScrapeGraphAI can process local documents including CSV files, PDFs, text files, and more. This is perfect for extracting structured information from your existing documents.
Extract structured data from CSV files using natural language:
import os

from dotenv import load_dotenv
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Load environment variables (including the API key) from a local .env file.
load_dotenv()

# Read the CSV file. The path is resolved relative to this script so the
# example works regardless of the current working directory.
FILE_NAME = "inputs/username.csv"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

# An explicit encoding keeps the read independent of the platform default.
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Define the configuration for the graph
openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "openai/gpt-4o",
    },
}

# Create the CSVScraperGraph instance and run it
csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    source=text,  # Pass the content of the file
    config=graph_config,
)

result = csv_scraper_graph.run()
print(result)

# Get graph execution info
graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
Process text documents and extract structured information:
import json
import os

from dotenv import load_dotenv
from scrapegraphai.graphs import DocumentScraperGraph

# Pull the API key (and any other settings) from a local .env file.
load_dotenv()

# LLM configuration handed to the graph.
graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_APIKEY"),
        "model": "openai/gpt-4o",
    }
}

# The document to analyse. Adjacent string literals are concatenated into a
# single string, so the text stays readable without changing its content.
divine_comedy_text = (
    " The Divine Comedy, Italian La Divina Commedia, original name La commedia, "
    "long narrative poem written in Italian circa 1308/21 by Dante. "
    "It is usually held to be one of the world's great works of literature. "
    "Divided into three major sections—Inferno, Purgatorio, and Paradiso—the "
    "narrative traces the journey of Dante from darkness and error to the "
    "revelation of the divine light, culminating in the Beatific Vision of God. "
    "Dante is guided by the Roman poet Virgil, who represents the epitome of "
    "human knowledge, from the dark wood through the descending circles of the "
    "pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided by "
    "the Roman poet Statius, who represents the fulfilment of human knowledge, "
    "and is finally led by his lifelong love, the Beatrice of his earlier "
    "poetry, through the celestial spheres of Paradise."
)

# Build the graph for the in-memory text and run it.
document_graph = DocumentScraperGraph(
    source=divine_comedy_text,
    prompt="Summarize the text and find the main topics",
    config=graph_config,
)
summary = document_graph.run()

# Pretty-print the result as JSON.
print(json.dumps(summary, indent=4))
from scrapegraphai.graphs import CSVScraperGraph

# Read CSV content. An explicit encoding keeps the read independent of the
# platform default.
with open("data.csv", "r", encoding="utf-8") as f:
    csv_content = f.read()

# `graph_config` is the LLM configuration dict defined in the full examples.
graph = CSVScraperGraph(
    prompt="Extract all email addresses",
    source=csv_content,
    config=graph_config,
)
Perfect for processing tabular data and spreadsheets.
from scrapegraphai.graphs import DocumentScraperGraph

# The source is ordinary text: it can be written inline, loaded from a
# file, etc.
doc_body = """Your document content here..."""

# `graph_config` is the LLM configuration dict defined in the full examples.
document_graph = DocumentScraperGraph(
    source=doc_body,
    prompt="Extract key information",
    config=graph_config,
)
Works with plain text, text extracted from PDFs, and other document formats.
from scrapegraphai.graphs import JSONScraperGraph

# Read JSON content. An explicit encoding keeps the read independent of the
# platform default.
with open("data.json", "r", encoding="utf-8") as f:
    json_content = f.read()

# `graph_config` is the LLM configuration dict defined in the full examples.
graph = JSONScraperGraph(
    prompt="Find all products with price > 100",
    source=json_content,
    config=graph_config,
)
Query JSON documents with natural language.
from scrapegraphai.graphs import XMLScraperGraph

# Read XML content. An explicit encoding keeps the read independent of the
# platform default.
with open("data.xml", "r", encoding="utf-8") as f:
    xml_content = f.read()

# `graph_config` is the LLM configuration dict defined in the full examples.
graph = XMLScraperGraph(
    prompt="Extract all product names and prices",
    source=xml_content,
    config=graph_config,
)
Parse XML documents without writing XPath queries.
For the Divine Comedy text with the prompt “Summarize the text and find the main topics”, the output looks like this:
{
    "summary": "The Divine Comedy is a long narrative poem by Dante written circa 1308-1321, divided into three sections: Inferno, Purgatorio, and Paradiso. It traces Dante's journey from darkness to divine enlightenment.",
    "main_topics": [
        "Italian literature",
        "Dante's spiritual journey",
        "Three realms: Hell, Purgatory, Paradise",
        "Guidance by Virgil, Statius, and Beatrice",
        "Medieval Christian theology"
    ],
    "key_figures": [
        "Dante",
        "Virgil",
        "Statius",
        "Beatrice"
    ]
}
from pathlib import Path

from scrapegraphai.graphs import CSVScraperMultiGraph

# CSV files to process together.
csv_files = [
    "data/sales_q1.csv",
    "data/sales_q2.csv",
    "data/sales_q3.csv",
]

# Read each file's full text, in the same order as `csv_files`. An explicit
# encoding keeps the reads independent of the platform default.
csv_contents = [
    Path(file_path).read_text(encoding="utf-8") for file_path in csv_files
]

# `graph_config` is the LLM configuration dict defined in the full examples.
multi_csv_graph = CSVScraperMultiGraph(
    prompt="Calculate total sales for each product",
    source=csv_contents,
    config=graph_config,
)

result = multi_csv_graph.run()
print(result)