Example: Web Scraping
A complete example that scrapes the quotes.toscrape.com site using PyWebFlx.
The workflow
1. inspect() -- understand the page
2. inspect(".quote") -- view the internal structure
3. extract_data() -- extract using the discovered selectors
Full code
import asyncio
from pywebflx import use_browser, configure_logging
# Turn on INFO-level logging through pywebflx before the browser session starts.
configure_logging(level="INFO")
async def main():
    """Inspect quotes.toscrape.com, then extract and print the quotes it finds.

    Runs the three-step workflow: a page-wide ``inspect()``, an ``inspect()``
    focused on ``.quote``, and an ``extract_data()`` call that uses the
    selectors discovered along the way.
    """
    async with use_browser(url="https://quotes.toscrape.com/") as browser:
        # Step 1 -- page-wide inspect; the repeated .quote element shows up, e.g.
        #   <div.quote> x 10 items
        #     sample[0]: "A reader lives a thousand lives..."
        overview = await browser.inspect(depth=5, samples=1)
        print(overview)

        # Step 2 -- inspect a single .quote to find .text, .author and the tags, e.g.
        #   <span.text> "A reader lives..."
        #   <small.author> "George R.R. Martin"
        #   <a.tag> x 4 items
        quote_structure = await browser.inspect(".quote", depth=5, samples=1)
        print(quote_structure)

        # Step 3 -- extract one row per .quote, one column per discovered selector.
        column_selectors = {
            "text": ".text",
            "author": ".author",
            "tags": ".tags",
        }
        rows = await browser.extract_data(
            container="body",
            row=".quote",
            columns=column_selectors,
        )

        for position, row in enumerate(rows, start=1):
            # Drop the "Tags:" label, then keep every non-blank line as a tag name.
            tag_lines = row["tags"].replace("Tags:", "").split("\n")
            tag_names = [line.strip() for line in tag_lines if line.strip()]
            print(f"{position}. {row['author']}: {row['text'][:60]}...")
            print(f" Tags: {', '.join(tag_names)}")

        print(f"\nTotal: {len(rows)} quotes")
# Run the example only when this file is executed as a script, so that
# importing it does not launch a browser session.
if __name__ == "__main__":
    asyncio.run(main())
Output
1. George R.R. Martin: "A reader lives a thousand lives before he dies...
Tags: read, readers, reading, reading-books
2. C.S. Lewis: "You can never get a cup of tea large enough or a book...
Tags: books, tea
3. Marilyn Monroe: "You believe lies so you eventually learn to trust...
Tags: lies, lying, trust
...
Total: 10 quotes
With pagination
async def scrape_all_pages():
    """Collect the text and author of every quote, following the "next" link.

    Returns:
        The rows returned by ``extract_data()`` for each visited page,
        accumulated in visiting order into a single list.
    """
    async with use_browser(url="https://quotes.toscrape.com/") as browser:
        collected = []
        page_number = 1
        next_link = "li.next a"

        while True:
            print(f"Page {page_number}...")
            page_rows = await browser.extract_data(
                container="body",
                row=".quote",
                columns={"text": ".text", "author": ".author"},
            )
            collected.extend(page_rows)

            # No "next" link means the page just read was the last one.
            has_next = await browser.element_exists(next_link)
            if not has_next:
                break

            await browser.click(next_link)
            # Fixed one-second pause before reading the following page.
            await asyncio.sleep(1)
            page_number += 1

        print(f"Total: {len(collected)} quotes across {page_number} pages")
        return collected
Export to CSV
import asyncio

import pandas as pd

# `await` is only valid inside a coroutine, so a plain script has to drive
# the scraper with asyncio.run() instead of awaiting it at module level.
quotes = asyncio.run(scrape_all_pages())

# One DataFrame row per scraped quote; index=False keeps the pandas row
# index out of the CSV file.
df = pd.DataFrame(quotes)
df.to_csv("quotes.csv", index=False)