The proxy rotation module provides utilities for searching, validating, and managing proxy servers for web scraping operations.
Type Definitions
ProxyBrokerCriteria
class ProxyBrokerCriteria(TypedDict, total=False):
anonymous: bool
countryset: Set[str]
secure: bool
timeout: float
search_outside_if_empty: bool
Criteria for searching proxy servers.
ProxySettings
class ProxySettings(TypedDict, total=False):
server: str
bypass: str
username: str
password: str
Proxy server configuration settings.
Proxy
class Proxy(ProxySettings):
criteria: ProxyBrokerCriteria
Complete proxy configuration including server settings and search criteria.
Functions
search_proxy_servers
search_proxy_servers(
anonymous: bool = True,
countryset: Optional[Set[str]] = None,
secure: bool = False,
timeout: float = 5.0,
max_shape: int = 5,
search_outside_if_empty: bool = True,
) -> List[str]
Search for proxy servers that match the specified broker criteria.
anonymous (bool, default True): Whether proxy servers should have minimum level-1 anonymity.
countryset
Optional[Set[str]]
default: None
Set of country codes for admissible proxy server locations (e.g., {"US", "GB", "CA"}).
secure (bool, default False): Whether proxy servers should support HTTPS. Defaults to HTTP.
timeout (float, default 5.0): The maximum timeout for proxy responses in seconds.
max_shape (int, default 5): The maximum number of proxy servers to return.
search_outside_if_empty (bool, default True): Whether to extend the search beyond the given countryset if no proxies are found in the specified countries.
Example: Basic Usage
from scrapegraphai.utils import search_proxy_servers
# Search for 3 anonymous proxies
proxies = search_proxy_servers(
anonymous=True,
max_shape=3,
timeout=5.0
)
print(f"Found {len(proxies)} proxies:")
for proxy in proxies:
print(f" - {proxy}")
Example: Country-Specific Proxies
from scrapegraphai.utils import search_proxy_servers
# Search for proxies in specific countries
proxies = search_proxy_servers(
anonymous=True,
countryset={"US", "GB", "CA"},
secure=True, # HTTPS proxies
timeout=3.0,
max_shape=5
)
for proxy in proxies:
print(proxy)
Example: With ScrapeGraphAI
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import search_proxy_servers
# Get a list of proxies
proxies = search_proxy_servers(
anonymous=True,
countryset={"US"},
max_shape=1
)
# Use the first proxy in your scraper configuration
graph_config = {
"llm": {"model": "openai/gpt-4o-mini"},
"loader_kwargs": {
"proxy": {
"server": proxies[0],
}
}
}
smart_scraper = SmartScraperGraph(
prompt="Extract product information",
source="https://example.com",
config=graph_config,
)
result = smart_scraper.run()
parse_or_search_proxy
parse_or_search_proxy(proxy: Proxy) -> ProxySettings
Parses a proxy configuration or searches for a matching one via broker. This function automatically determines whether to parse an existing proxy configuration or search for a new one.
The proxy configuration to parse or search for. Must include a "server" field.
A Playwright-compliant proxy configuration ready to use.
Example: Parse Existing Proxy
from scrapegraphai.utils import parse_or_search_proxy
# Parse a known proxy server
proxy_config = {
"server": "http://103.10.63.135:8080",
"username": "myuser",
"password": "mypass"
}
parsed_proxy = parse_or_search_proxy(proxy_config)
print(parsed_proxy)
# Output: {'server': 'http://103.10.63.135:8080', 'username': 'myuser', 'password': 'mypass'}
Example: Search for Proxy via Broker
from scrapegraphai.utils import parse_or_search_proxy
# Search for a proxy using broker criteria
proxy_config = {
"server": "broker",
"criteria": {
"anonymous": True,
"countryset": {"US", "GB"},
"secure": True,
"timeout": 5.0
}
}
proxy_settings = parse_or_search_proxy(proxy_config)
print(proxy_settings)
# Output: {'server': 'http://123.45.67.89:8080'}
is_ipv4_address
is_ipv4_address(address: str) -> bool
Check if a given address conforms to a valid IPv4 address format.
The address string to validate.
True if the address is a valid IPv4 address, False otherwise.
Example
from scrapegraphai.utils import is_ipv4_address
print(is_ipv4_address("192.168.1.1")) # True
print(is_ipv4_address("256.1.1.1")) # False
print(is_ipv4_address("example.com")) # False
print(is_ipv4_address("103.10.63.135")) # True
Complete Example: Rotating Proxies
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import search_proxy_servers
import random
# Get a pool of proxies
proxy_pool = search_proxy_servers(
anonymous=True,
countryset={"US", "GB", "CA"},
secure=True,
max_shape=10,
timeout=5.0
)
print(f"Proxy pool contains {len(proxy_pool)} proxies")
# Function to scrape with random proxy
def scrape_with_rotation(url: str, prompt: str):
# Select a random proxy from the pool
proxy = random.choice(proxy_pool)
graph_config = {
"llm": {"model": "openai/gpt-4o-mini"},
"loader_kwargs": {
"proxy": {"server": proxy}
}
}
smart_scraper = SmartScraperGraph(
prompt=prompt,
source=url,
config=graph_config,
)
print(f"Using proxy: {proxy}")
return smart_scraper.run()
# Scrape multiple pages with rotation
urls = [
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page3",
]
results = []
for url in urls:
result = scrape_with_rotation(url, "Extract main content")
results.append(result)
print(f"Scraped {url}")
print(f"Total results: {len(results)}")
Error Handling
from scrapegraphai.utils import search_proxy_servers
from fp.errors import FreeProxyException
try:
proxies = search_proxy_servers(
anonymous=True,
countryset={"XX"}, # Invalid country code
max_shape=5,
search_outside_if_empty=False # Don't search outside
)
except FreeProxyException as e:
print(f"Failed to find proxies: {e}")
# Fallback: try without country restriction
proxies = search_proxy_servers(
anonymous=True,
max_shape=5
)
Best Practices
-
Proxy Pool Management: Create a pool of proxies and rotate through them to avoid rate limiting.
-
Timeout Configuration: Set appropriate timeouts based on your needs:
# Fast proxies for quick operations
proxies = search_proxy_servers(timeout=2.0)
# More lenient for slower connections
proxies = search_proxy_servers(timeout=10.0)
-
Authentication: Always provide username and password together:
proxy_config = {
"server": "http://proxy.example.com:8080",
"username": "user",
"password": "pass" # Both must be provided
}
-
Country Selection: Choose countries closer to your target website for better performance:
# For US-based websites
proxies = search_proxy_servers(countryset={"US", "CA"})
# For EU-based websites
proxies = search_proxy_servers(countryset={"GB", "DE", "FR"})
-
Secure Proxies: Use HTTPS proxies for sensitive operations:
proxies = search_proxy_servers(secure=True, anonymous=True)