Example: Web Scraping¶

A complete example of scraping the quotes.toscrape.com site with PyWebFlx.

The workflow¶

1. inspect() -- understand the page
2. inspect(".quote") -- view the internal structure of a single quote
3. extract_data() -- extract the data using the discovered selectors

Full code¶

import asyncio
from pywebflx import use_browser, configure_logging

# Request INFO-level logging from pywebflx before the browser session starts.
configure_logging(level="INFO")

async def main():
    """Inspect quotes.toscrape.com, then extract and print each quote found."""
    selectors = {
        "text": ".text",
        "author": ".author",
        "tags": ".tags",
    }

    async with use_browser(url="https://quotes.toscrape.com/") as browser:
        # 1. Page-wide inspect -- this is where the repeated .quote shows up
        overview = await browser.inspect(depth=5, samples=1)
        print(overview)
        # <div.quote> x 10 items
        #   sample[0]: "A reader lives a thousand lives..."

        # 2. Inspect scoped to ".quote" -- reveals .text, .author, .tags
        quote_layout = await browser.inspect(".quote", depth=5, samples=1)
        print(quote_layout)
        # <span.text> "A reader lives..."
        # <small.author> "George R.R. Martin"
        # <a.tag> x 4 items

        # 3. Pull the rows out with the selectors found above
        rows = await browser.extract_data(
            container="body",
            row=".quote",
            columns=selectors,
        )

        for position, row in enumerate(rows, 1):
            # Treat the tags cell as text: drop the "Tags:" label, split on
            # newlines, and keep only the non-blank pieces.
            unlabeled = row["tags"].replace("Tags:", "")
            stripped = (piece.strip() for piece in unlabeled.split("\n"))
            tag_names = [name for name in stripped if name]

            # Only the first 60 characters of the quote text are shown.
            print(f"{position}. {row['author']}: {row['text'][:60]}...")
            print(f"   Tags: {', '.join(tag_names)}")

        print(f"\nTotal: {len(rows)} quotes")

# Script entry point: run the main() coroutine to completion on a new event loop.
asyncio.run(main())

Output¶

1. George R.R. Martin: "A reader lives a thousand lives before he dies...
   Tags: read, readers, reading, reading-books
2. C.S. Lewis: "You can never get a cup of tea large enough or a book...
   Tags: books, tea
3. Marilyn Monroe: "You believe lies so you eventually learn to trust...
   Tags: lies, lying, trust
...
Total: 10 quotes

With pagination¶

async def scrape_all_pages():
    """Collect the text and author of every quote, following "next" links.

    Starts at https://quotes.toscrape.com/ and keeps extracting rows until
    the page no longer contains a ``li.next a`` element.

    Returns:
        The rows returned by ``extract_data`` for all visited pages, in
        visiting order, gathered into a single list.
    """
    next_link = "li.next a"
    fields = {"text": ".text", "author": ".author"}

    async with use_browser(url="https://quotes.toscrape.com/") as browser:
        collected = []
        page_number = 1
        has_next = True

        while has_next:
            print(f"Page {page_number}...")

            rows = await browser.extract_data(
                container="body",
                row=".quote",
                columns=fields,
            )
            collected.extend(rows)

            # Move on only while the current page still offers a "next" link.
            has_next = await browser.element_exists(next_link)
            if has_next:
                await browser.click(next_link)
                # Fixed one-second pause after the click, before the next extraction.
                await asyncio.sleep(1)
                page_number += 1

        print(f"Total: {len(collected)} quotes across {page_number} pages")
        return collected

Export to CSV¶

import pandas as pd

# scrape_all_pages() is a coroutine function, so drive it with asyncio.run();
# a bare top-level `await` is a SyntaxError in a regular script.
quotes = asyncio.run(scrape_all_pages())
df = pd.DataFrame(quotes)
# index=False leaves the DataFrame's row index out of the CSV file.
df.to_csv("quotes.csv", index=False)

Export to JSON¶

import json

# ensure_ascii=False writes non-ASCII characters as-is, so pin the file
# encoding to UTF-8 instead of relying on the platform's default encoding.
with open("quotes.json", "w", encoding="utf-8") as f:
    json.dump(quotes, f, indent=2, ensure_ascii=False)