"""Scrape a fixed list of docs.rs pages and merge them into one Markdown file.

Each page is fetched over HTTP, its main documentation element is located
with BeautifulSoup, converted to Markdown with markdownify, and appended to a
single output document under a "Source" heading.
"""

import os
import re
from typing import Iterable, Optional

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Make sure you have the required libraries:
# pip install requests beautifulsoup4 markdownify

# Seconds to wait on each HTTP request before giving up.
REQUEST_TIMEOUT_SECONDS = 10

# Browser-like User-Agent header sent with every request of the session.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)

# Runs of anything other than ASCII letters, digits, or dots; each run is
# collapsed to a single underscore when building filenames.
_UNSAFE_FILENAME_CHARS = re.compile(r"[^a-zA-Z0-9.]+")

# Pages merged into the guide when this file is run as a script.
POCKET_GUIDE_URLS = [
    "https://docs.rs/iced/0.13.1/iced/index.html",
    "https://docs.rs/iced/0.13.1/iced/advanced/index.html",
    "https://docs.rs/iced/0.13.1/iced/application/index.html",
    "https://docs.rs/iced/0.13.1/iced/application/struct.Application.html",
    "https://docs.rs/iced/0.13.1/iced/widget/index.html",
    "https://docs.rs/iced/0.13.1/iced/widget/struct.Container.html",
    "https://docs.rs/iced/0.13.1/iced/widget/struct.Column.html",
    "https://docs.rs/iced/0.13.1/iced/widget/struct.Row.html",
    "https://docs.rs/iced/0.13.1/iced/enum.Length.html",
    "https://docs.rs/iced/0.13.1/iced/alignment/index.html",
    "https://docs.rs/iced/0.13.1/iced/enum.Alignment.html",
    "https://docs.rs/iced/0.13.1/iced/type.Element.html",
    "https://docs.rs/iced/0.13.1/iced/struct.Task.html",
    "https://docs.rs/iced/0.13.1/iced/task/index.html",
    "https://docs.rs/iced/0.13.1/iced/struct.Subscription.html",
    "https://docs.rs/iced/0.13.1/iced/stream/index.html",
    "https://docs.rs/iced/0.13.1/iced/daemon/index.html",
    "https://docs.rs/iced/0.13.1/iced/daemon/struct.Daemon.html",
    "https://docs.rs/iced/0.13.1/iced/theme/index.html",
    "https://docs.rs/iced/0.13.1/iced/enum.Theme.html",
    "https://docs.rs/iced/0.13.1/iced/settings/index.html",
    "https://docs.rs/iced/0.13.1/iced/settings/struct.Settings.html",
    "https://docs.rs/iced/0.13.1/iced/window/index.html",
    "https://docs.rs/iced/0.13.1/iced/keyboard/index.html",
    "https://docs.rs/iced/0.13.1/iced/mouse/index.html",
    "https://docs.rs/iced/0.13.1/iced/touch/index.html",
]

# File the merged guide is written to when this file is run as a script.
OUTPUT_FILENAME = "iced_pocket_guide.md"


def sanitize_filename(url: str) -> str:
    """Create a safe filename from a URL.

    If the URL contains ``docs.rs/``, everything up to and including its
    first occurrence is dropped. Each run of characters other than ASCII
    letters, digits, or dots then becomes one underscore; leading and
    trailing underscores are stripped, the result is lower-cased, and it is
    cut to at most 100 characters.

    Args:
        url: The URL to derive a filename from.

    Returns:
        The sanitized, lower-case name (at most 100 characters).
    """
    if "docs.rs/" in url:
        url = url.split("docs.rs/", 1)[1]
    sanitized = _UNSAFE_FILENAME_CHARS.sub("_", url)
    # NOTE: truncation happens after stripping, so a name longer than 100
    # characters can still end in an underscore.
    return sanitized.strip("_").lower()[:100]


def scrape_and_convert_url(url: str, session: requests.Session) -> Optional[str]:
    """Fetch one URL and convert its main documentation content to Markdown.

    Progress, warnings, and errors are reported on stdout; failures never
    propagate to the caller.

    Args:
        url: Address of the page to fetch.
        session: Session used to issue the GET request.

    Returns:
        The Markdown text of the page's main content element, or ``None``
        when the request fails, the element cannot be found, or any other
        error occurs while processing the page.
    """
    print(f"Fetching: {url}")
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Look the content up by id first; fall back to the class name of
        # the same value if no element carries that id.
        main_content = soup.find(id="main-content")
        if main_content is None:
            main_content = soup.find(class_="main-content")

        if main_content is None:
            print(" -> Warning: Could not find a recognizable main content section in page.")
            return None

        markdown_text = md(str(main_content), heading_style="ATX")
        print(" -> Success: Converted content.")
        return markdown_text
    except requests.exceptions.RequestException as e:
        print(f" -> Error: Failed to fetch URL {url}. Reason: {e}")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: one bad page must not abort the
        # whole run, so the error is reported and the page is skipped.
        print(f" -> Error: An unexpected error occurred for {url}. Reason: {e}")
        return None


def process_url_list(urls: Iterable[str], output_file: str) -> None:
    """Convert every URL to Markdown and save everything into one file.

    The output starts with a document title. Each accepted URL contributes a
    horizontal rule, a "Source" heading linking to the page, and either the
    converted content or a failure notice. URL fragments (``#...``) are
    removed before processing; entries that do not start with ``http`` and
    entries already processed are skipped with a message on stdout.

    Args:
        urls: URLs to process, in output order.
        output_file: Path of the Markdown file to create or overwrite.
    """
    with requests.Session() as session:
        session.headers.update({"User-Agent": USER_AGENT})

        with open(output_file, "w", encoding="utf-8") as f:
            f.write("# Iced Pocket Guide\n\n")

            processed_urls = set()
            for url in urls:
                cleaned_url = url.split("#")[0]

                # An empty string (anchor-only entry) also fails this check.
                if not cleaned_url.startswith("http"):
                    print(f"Skipping invalid or anchor-only URL: {url}")
                    continue
                if cleaned_url in processed_urls:
                    print(f"Skipping duplicate URL: {cleaned_url}")
                    continue
                processed_urls.add(cleaned_url)

                f.write("\n---\n\n")
                f.write(f"## Source: [{cleaned_url}]({cleaned_url})\n\n")

                content = scrape_and_convert_url(cleaned_url, session)
                if content:
                    f.write(content)
                else:
                    f.write("*Failed to retrieve or convert content for this URL.*")
                f.write("\n\n")

    # The emoji is written as a named escape so the source stays ASCII-only
    # and cannot be corrupted by an encoding round-trip.
    print(f"\n\N{PARTY POPPER} Conversion complete! All content saved to '{output_file}'.")


def main() -> None:
    """Build the pocket guide from the built-in URL list."""
    process_url_list(POCKET_GUIDE_URLS, OUTPUT_FILENAME)


if __name__ == "__main__":
    main()