# Python source listing (123 lines, 5.1 KiB)
import os
import re

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Make sure you have the required libraries:
#   pip install requests beautifulsoup4 markdownify


def sanitize_filename(url, max_length=100):
    """Create a lowercase, filesystem-safe name from a URL.

    If the URL contains ``docs.rs/``, everything up to and including its
    first occurrence is dropped. Every run of characters other than ASCII
    letters, digits and ``.`` is then collapsed into a single ``_``.

    Args:
        url: The URL (or any other string) to convert.
        max_length: Maximum number of characters in the result.
            Defaults to 100.

    Returns:
        The sanitized name: lowercased, at most ``max_length`` characters
        long, with no leading or trailing underscores. The result is an
        empty string when ``url`` contains no letters, digits or dots.
    """
    if "docs.rs/" in url:
        url = url.split("docs.rs/", 1)[1]
    sanitized = re.sub(r'[^a-zA-Z0-9.]+', '_', url)
    # Strip once more after truncating: the cut can land right behind a
    # separator and would otherwise leave a dangling underscore.
    return sanitized.strip('_').lower()[:max_length].strip('_')


def scrape_and_convert_url(url, session):
    """Fetch one page and return its main documentation content as Markdown.

    The main content element is looked up by ``id="main-content"`` first
    and, if that finds nothing, by the ``main-content`` class.

    Args:
        url: Address of the page to download.
        session: Object exposing a requests-style ``get(url, timeout=...)``
            method; this script passes a ``requests.Session``.

    Returns:
        The converted Markdown text, or ``None`` when the request fails,
        the page has no main-content element, or any other error occurs.
        Failures are reported on stdout instead of being raised.
    """
    print(f"Fetching: {url}")
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Prefer the element id; the class name is only a fallback.
        main_content = soup.find(id="main-content")
        if main_content is None:
            main_content = soup.find(class_="main-content")

        if main_content is None:
            print(" -> Warning: Could not find a recognizable main content section in page.")
            return None

        markdown_text = md(str(main_content), heading_style="ATX")
        print(" -> Success: Converted content.")
        return markdown_text

    except requests.exceptions.RequestException as e:
        print(f" -> Error: Failed to fetch URL {url}. Reason: {e}")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: one page that fails to parse or
        # convert must not abort the processing of the remaining URLs.
        print(f" -> Error: An unexpected error occurred for {url}. Reason: {e}")
        return None


def process_url_list(urls, output_file, title="Iced Pocket Guide"):
    """Convert a list of pages to Markdown and save them in a single file.

    Each URL has its ``#fragment`` removed. Entries that are not strings,
    that do not start with ``http`` after fragment removal, or that were
    already processed are skipped with a message on stdout. Every remaining
    URL gets a ``## Source`` section containing either the converted
    content or a failure note.

    Args:
        urls: Iterable of URLs to process, in output order.
        output_file: Path of the Markdown file to write; it is overwritten.
        title: Text of the top-level heading written at the start of the
            file. Defaults to ``"Iced Pocket Guide"``.
    """
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f"# {title}\n\n")

            processed_urls = set()
            for url in urls:
                # A non-string entry cannot be a URL; skip it rather than
                # letting .split() abort the whole run.
                if not isinstance(url, str):
                    print(f"Skipping invalid or anchor-only URL: {url}")
                    continue

                cleaned_url = url.split('#')[0]

                if cleaned_url in processed_urls:
                    print(f"Skipping duplicate URL: {cleaned_url}")
                    continue
                # An empty string (anchor-only input) fails this check too.
                if not cleaned_url.startswith("http"):
                    print(f"Skipping invalid or anchor-only URL: {url}")
                    continue

                processed_urls.add(cleaned_url)

                f.write("\n---\n\n")
                f.write(f"## Source: [{cleaned_url}]({cleaned_url})\n\n")

                content = scrape_and_convert_url(cleaned_url, session)

                if content:
                    f.write(content)
                else:
                    f.write("*Failed to retrieve or convert content for this URL.*")

                f.write("\n\n")

    print(f"\n🎉 Conversion complete! All content saved to '{output_file}'.")


if __name__ == '__main__':
    # Pages of the iced 0.13.1 API documentation on docs.rs, listed in the
    # order in which they appear in the generated guide.
    pocket_guide_urls = [
        "https://docs.rs/iced/0.13.1/iced/index.html",
        "https://docs.rs/iced/0.13.1/iced/advanced/index.html",
        "https://docs.rs/iced/0.13.1/iced/application/index.html",
        "https://docs.rs/iced/0.13.1/iced/application/struct.Application.html",
        "https://docs.rs/iced/0.13.1/iced/widget/index.html",
        "https://docs.rs/iced/0.13.1/iced/widget/struct.Container.html",
        "https://docs.rs/iced/0.13.1/iced/widget/struct.Column.html",
        "https://docs.rs/iced/0.13.1/iced/widget/struct.Row.html",
        "https://docs.rs/iced/0.13.1/iced/enum.Length.html",
        "https://docs.rs/iced/0.13.1/iced/alignment/index.html",
        "https://docs.rs/iced/0.13.1/iced/enum.Alignment.html",
        "https://docs.rs/iced/0.13.1/iced/type.Element.html",
        "https://docs.rs/iced/0.13.1/iced/struct.Task.html",
        "https://docs.rs/iced/0.13.1/iced/task/index.html",
        "https://docs.rs/iced/0.13.1/iced/struct.Subscription.html",
        "https://docs.rs/iced/0.13.1/iced/stream/index.html",
        "https://docs.rs/iced/0.13.1/iced/daemon/index.html",
        "https://docs.rs/iced/0.13.1/iced/daemon/struct.Daemon.html",
        "https://docs.rs/iced/0.13.1/iced/theme/index.html",
        "https://docs.rs/iced/0.13.1/iced/enum.Theme.html",
        "https://docs.rs/iced/0.13.1/iced/settings/index.html",
        "https://docs.rs/iced/0.13.1/iced/settings/struct.Settings.html",
        "https://docs.rs/iced/0.13.1/iced/window/index.html",
        "https://docs.rs/iced/0.13.1/iced/keyboard/index.html",
        "https://docs.rs/iced/0.13.1/iced/mouse/index.html",
        "https://docs.rs/iced/0.13.1/iced/touch/index.html",
    ]

    # The combined Markdown guide is written to this path.
    OUTPUT_FILENAME = "iced_pocket_guide.md"

    process_url_list(pocket_guide_urls, OUTPUT_FILENAME)