Loading...
Loading...
Compare original and translation side by side
pip install beautifulsoup4 requests lxmlpip install beautifulsoup4 requests lxmlfrom bs4 import BeautifulSoup
import requestsfrom bs4 import BeautifulSoup
import requestsundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedimport reimport reundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedelement.parent
element.parents # Generator of all ancestorselement.parent
element.parents # 所有祖先元素的生成器undefinedundefinedelement.children # Direct children (generator)
list(element.children)
element.contents # Direct children (list)
element.descendants # All descendants (generator)element.children # 直接子元素(生成器)
list(element.children)
element.contents # 直接子元素(列表)
element.descendants # 所有后代元素(生成器)undefinedundefinedelement.next_sibling
element.previous_sibling
element.next_siblings # Generator
element.previous_siblings # Generatorelement.next_sibling
element.previous_sibling
element.next_siblings # 后续兄弟元素生成器
element.previous_siblings # 前序兄弟元素生成器undefinedundefineddef safe_text(element, selector, default=''):
"""Safely extract text from element."""
found = element.select_one(selector)
return found.get_text(strip=True) if found else default
def safe_attr(element, selector, attr, default=None):
"""Safely extract attribute from element."""
found = element.select_one(selector)
return found.get(attr, default) if found else defaultdef safe_text(element, selector, default=''):
"""安全提取元素中的文本。"""
found = element.select_one(selector)
return found.get_text(strip=True) if found else default
def safe_attr(element, selector, attr, default=None):
"""安全提取元素中的属性。"""
found = element.select_one(selector)
return found.get(attr, default) if found else defaultdef extract_table(table):
"""Extract table data as list of dictionaries."""
headers = [th.get_text(strip=True) for th in table.select('th')]
rows = []
for tr in table.select('tbody tr'):
cells = [td.get_text(strip=True) for td in tr.select('td')]
if cells:
rows.append(dict(zip(headers, cells)))
return rowsdef extract_table(table):
"""将表格数据提取为字典列表。"""
headers = [th.get_text(strip=True) for th in table.select('th')]
rows = []
for tr in table.select('tbody tr'):
cells = [td.get_text(strip=True) for td in tr.select('td')]
if cells:
rows.append(dict(zip(headers, cells)))
return rowsdef extract_items(soup, selector, extractor):
"""Extract multiple items using a custom extractor function."""
return [extractor(item) for item in soup.select(selector)]def extract_items(soup, selector, extractor):
"""使用自定义提取器函数提取多个条目。"""
return [extractor(item) for item in soup.select(selector)]undefinedundefinedfrom urllib.parse import urljoin
def resolve_url(base_url, relative_url):
"""Convert relative URL to absolute."""
if not relative_url:
return None
return urljoin(base_url, relative_url)from urllib.parse import urljoin
def resolve_url(base_url, relative_url):
"""将相对URL转换为绝对URL。"""
if not relative_url:
return None
return urljoin(base_url, relative_url)undefinedundefinedundefinedundefinedundefinedundefinedimport requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
class ProductScraper:
def __init__(self, base_url):
self.base_url = base_url
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)'
})
def fetch_page(self, url):
"""Fetch and parse a page."""
response = self.session.get(url, timeout=30)
response.raise_for_status()
return BeautifulSoup(response.content, 'lxml')
def extract_product(self, item):
"""Extract product data from a card element."""
return {
'name': self._safe_text(item, '.product-title'),
'price': self._parse_price(item.select_one('.price')),
'rating': self._safe_attr(item, '.rating', 'data-rating'),
'image': self._resolve(self._safe_attr(item, 'img', 'src')),
'url': self._resolve(self._safe_attr(item, 'a', 'href')),
'in_stock': not item.select_one('.out-of-stock')
}
def scrape_products(self, url):
"""Scrape all products from a page."""
soup = self.fetch_page(url)
items = soup.select('.product-card')
return [self.extract_product(item) for item in items]
def _safe_text(self, element, selector, default=''):
found = element.select_one(selector)
return found.get_text(strip=True) if found else default
def _safe_attr(self, element, selector, attr, default=None):
found = element.select_one(selector)
return found.get(attr, default) if found else default
def _parse_price(self, element):
if not element:
return None
text = element.get_text(strip=True)
try:
return float(text.replace('$', '').replace(',', ''))
except ValueError:
return None
def _resolve(self, url):
return urljoin(self.base_url, url) if url else Noneimport requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
class ProductScraper:
def __init__(self, base_url):
self.base_url = base_url
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)'
})
def fetch_page(self, url):
"""抓取并解析页面。"""
response = self.session.get(url, timeout=30)
response.raise_for_status()
return BeautifulSoup(response.content, 'lxml')
def extract_product(self, item):
"""从卡片元素中提取产品数据。"""
return {
'name': self._safe_text(item, '.product-title'),
'price': self._parse_price(item.select_one('.price')),
'rating': self._safe_attr(item, '.rating', 'data-rating'),
'image': self._resolve(self._safe_attr(item, 'img', 'src')),
'url': self._resolve(self._safe_attr(item, 'a', 'href')),
'in_stock': not item.select_one('.out-of-stock')
}
def scrape_products(self, url):
"""抓取页面上的所有产品。"""
soup = self.fetch_page(url)
items = soup.select('.product-card')
return [self.extract_product(item) for item in items]
def _safe_text(self, element, selector, default=''):
found = element.select_one(selector)
return found.get_text(strip=True) if found else default
def _safe_attr(self, element, selector, attr, default=None):
found = element.select_one(selector)
return found.get(attr, default) if found else default
def _parse_price(self, element):
if not element:
return None
text = element.get_text(strip=True)
try:
return float(text.replace('$', '').replace(',', ''))
except ValueError:
return None
def _resolve(self, url):
return urljoin(self.base_url, url) if url else Noneundefinedundefinedundefinedundefinedundefinedundefinedselect()select_one()get_text(strip=True)select()select_one()get_text(strip=True)