# Examples

This page contains some more advanced and realistic examples of using Unparallel.
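
Both examples follow the same skeleton: build a list of URLs (or paths), hand it to `up` inside an `async` function, and run that function with `asyncio.run`. A minimal sketch of this pattern (httpbin.org is just a placeholder endpoint, not related to Unparallel):

```python notest
import asyncio

from unparallel import up


async def main():
    # Three GET requests against a placeholder JSON API, made concurrently.
    urls = [f"https://httpbin.org/get?i={i}" for i in range(3)]
    results = await up(urls, method="GET")
    print(f"Got {len(results)} responses")


if __name__ == "__main__":
    asyncio.run(main())
```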

## Query (all) posts of a WordPress site

```python notest
"""
This script uses the WordPress API to query (all) posts of the website
https://techcrunch.com.

See Also:
    https://developer.wordpress.org/rest-api/reference/posts/
"""
import asyncio
from pprint import pp

import httpx

from unparallel import up
from unparallel.unparallel import RequestError


async def main():
    page_size = 20
    base_url = "https://techcrunch.com/wp-json/wp/v2"
    pagination_url = f"/posts?per_page={page_size}"

    # Get the page count via a cheap HEAD request
    response = httpx.head(base_url + pagination_url)
    total_pages = int(response.headers["X-WP-TotalPages"])
    print(f"Website '{base_url}' has {total_pages} pages (page size = {page_size})")

    # Comment out the line below to get all pages. Note that you might have to
    # adjust the settings for this to work without errors. For me, it worked
    # using max_connections=800 and timeout=60.
    total_pages = min(total_pages, 1000)

    # Get all pages and flatten the result
    paths = [f"{pagination_url}&page={i}" for i in range(1, total_pages + 1)]
    results = await up(paths, method="GET", base_url=base_url, flatten_result=True)

    # Check if some requests failed
    valid_results = [item for item in results if not isinstance(item, RequestError)]
    fails = len(results) - len(valid_results)
    print(f"{fails=} ({fails/len(results):.2%})")

    # Display some properties of the first 5 posts
    interesting_keys = ["id", "date", "slug", "title", "author"]
    pp(
        [
            {k: v for k, v in item.items() if k in interesting_keys}
            for item in valid_results[:5]
        ]
    )


if __name__ == "__main__":
    asyncio.run(main())
```

If you run the example, it should print something like the following:
```
Website 'https://techcrunch.com/wp-json/wp/v2' has 12202 pages (page size = 20)
Making async requests: 100%|███████████| 1000/1000 [00:13<00:00, 74.30it/s]
[{'id': 2642913,
  'date': '2023-12-26T07:05:21',
  'slug': 'vcs-are-entering-2024-with-healthy-paranoia',
  'title': {'rendered': 'VCs are entering 2024 with ‘healthy '
                        'paranoia’'},
  'author': 428363},
 {'id': 2645233,
  'date': '2023-12-26T06:35:00',
  'slug': 'what-vcs-are-looking-for-in-the-next-wave-of-cybersecurity-startups',
  'title': {'rendered': 'What VCs are looking for in the next wave of '
                        'cybersecurity startups'},
  'author': 133574551},
 {'id': 2641499,
  'date': '2023-12-26T06:05:55',
  'slug': 'hackers-stole-2-billion-in-crypto-in-2023-data-shows',
  'title': {'rendered': 'Hackers stole $2 billion in crypto in 2023, data '
                        'shows'},
  'author': 133574594},
 {'id': 2635851,
  'date': '2023-12-26T05:05:28',
  'slug': 'the-eternal-struggle-between-open-source-and-proprietary-software',
  'title': {'rendered': 'The eternal struggle between open source and '
                        'proprietary software'},
  'author': 133574560},
 {'id': 2645355,
  'date': '2023-12-26T03:52:55',
  'slug': 'nonprofit-code-org-sues-byjus-unit-whitehat-jr-over-payment-dues',
  'title': {'rendered': 'Nonprofit Code.org sues Byju’s unit WhiteHat Jr '
                        'over payment dues'},
  'author': 133574269}]
```
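
The comment in the script points at the knobs you need when you drop the 1000-page cap: `max_connections` and `timeout`. A sketch of the adjusted call, using the values that worked for the author (tune them for your own connection and the server's rate limits):

```python notest
# Fetch *all* pages instead of capping at 1000. max_connections=800 and
# timeout=60 are the values mentioned in the comment above; adjust as needed.
results = await up(
    paths,
    method="GET",
    base_url=base_url,
    flatten_result=True,
    max_connections=800,
    timeout=60,
)
```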
## Fetch the content of multiple websites
```python notest
"""
This script is based on a use case from Stack Overflow and fetches the content
of various webpages.

See Also:
    https://www.stackoverflow.com/a/57129241
"""
import asyncio

from unparallel import up
from unparallel.unparallel import RequestError

# Extracted from https://en.wikipedia.org/wiki/List_of_most-visited_websites
websites = """https://www.google.com/
https://www.youtube.com/
https://www.facebook.com/
https://www.wikipedia.org/
https://www.instagram.com/
https://www.reddit.com/
https://www.amazon.com/
https://www.duckduckgo.com/
https://www.yahoo.com/
https://www.tiktok.com/
https://www.bing.com/
https://www.yahoo.co.jp/
https://www.weather.com/
https://www.whatsapp.com/
https://www.yandex.ru/
https://www.openai.com/
https://www.live.com/
https://www.microsoft.com/
https://www.linkedin.com/
https://www.quora.com/
https://www.twitch.tv/
https://www.naver.com/
https://www.netflix.com/
https://www.office.com/
https://www.vk.com/
https://www.globo.com/
https://www.Aliexpress.com/
https://www.cnn.com/
https://www.zoom.us/
https://www.imdb.com/
https://www.x.com/
https://www.nytimes.com/
https://www.espn.com/
https://www.amazon.co.jp/
https://www.pinterest.com/
https://www.uol.com.br/
https://www.ebay.com/
https://www.marca.com/
https://www.canva.com/
https://www.spotify.com/
https://www.bbc.com/
https://www.paypal.com/
https://www.apple.com/"""


async def main():
    urls = websites.split("\n")

    # Get all pages
    results = await up(
        urls, method="GET", response_fn=lambda x: x.text, raise_for_status=False
    )

    # Print the first 100 characters of the first 5 pages; failed requests
    # come back as RequestError instances, so skip slicing those
    for url, content in zip(urls, results[:5]):
        if isinstance(content, RequestError):
            print(url, "failed:", content)
        else:
            print(url, repr(content[:100]))


if __name__ == "__main__":
    asyncio.run(main())
```

If you run the example, it should print something like the following:

```
Making async requests: 100%|███████████| 43/43 [00:03<00:00, 11.60it/s]
https://www.google.com/ '<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="de-AT"><head><meta cont'
https://www.youtube.com/ '<!DOCTYPE html><html style="font-size: 10px;font-family: Roboto, Arial, sans-serif;" lang="de-DE" da'
https://www.facebook.com/ '<!DOCTYPE html>\n<html lang="de" id="facebook" class="no_js">\n<head><meta charset="utf-8" /><meta nam'
https://www.wikipedia.org/ '<!DOCTYPE html>\n<html lang="en" class="no-js">\n<head>\n<meta charset="utf-8">\n<title>Wikipedia</title'
https://www.instagram.com/ '<!DOCTYPE html><html class="_9dls" lang="en" dir="ltr"><head><link data-default-icon="https://static'
```
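
Since `response_fn` receives the raw `httpx.Response` (as the `lambda x: x.text` above shows), you can return any projection of it. A hedged variation that keeps only the status code and body size per site instead of the full HTML (the tuple shape is an illustrative choice, not part of Unparallel's API):

```python notest
# Drop-in replacement for the `up(...)` call in main() above: collect
# (status_code, content_length) per site instead of the page text.
results = await up(
    urls,
    method="GET",
    response_fn=lambda r: (r.status_code, len(r.content)),
    raise_for_status=False,
)
```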