"""Download a list of documentation pages and save each one as Markdown.

Required third-party libraries:
    pip install requests beautifulsoup4 markdownify
"""

import os
import re
import time

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Trailing ".html" / ".htm" extension of a URL path (case-insensitive).
_HTML_EXTENSION = re.compile(r'\.html?$', re.IGNORECASE)
# Any run of characters that is not an ASCII letter, digit or dot.
_UNSAFE_CHARS = re.compile(r'[^a-zA-Z0-9.]+')


def sanitize_filename(url_path):
    """Create a safe Markdown filename from a URL path.

    A trailing ``.html``/``.htm`` extension is dropped, every run of
    characters other than ASCII letters, digits and dots is replaced by a
    single underscore, and the result is lower-cased and limited to 100
    characters before ``.md`` is appended.

    Args:
        url_path: Path portion of the URL (relative to the base URL).

    Returns:
        The filename, or ``"index.md"`` when nothing usable is left
        (for example for the empty path).
    """
    stem = _HTML_EXTENSION.sub('', url_path)
    # Leading/trailing dots and underscores are stripped so the result is
    # never a hidden file or an extension-only name such as ".md".
    stem = _UNSAFE_CHARS.sub('_', stem).strip('._').lower()[:100]
    if not stem:
        return "index.md"
    return stem + ".md"


def scrape_and_convert_url(url, session):
    """Fetch one URL and convert its main content section to Markdown.

    The content element is looked up by ``id="main-content"`` first and,
    failing that, by the ``<main>`` tag.

    Args:
        url: Absolute URL of the page to fetch.
        session: ``requests.Session`` used to perform the GET request.

    Returns:
        The Markdown text, prefixed with a ``# Source: <url>`` header, or
        ``None`` when the request fails, no content element is found, or
        the conversion raises.
    """
    print(f"Fetching: {url}")
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # 1. Try to find the most specific ID first.
        main_content = soup.find(id="main-content")
        # 2. If that fails, fall back to finding the standard <main> HTML tag.
        if not main_content:
            main_content = soup.find("main")

        if not main_content:
            print(" -> Warning: Could not find a recognizable main content section in page.")
            return None

        # Prepend the source URL to the content.
        header = f"# Source: {url}\n\n"
        markdown_text = md(str(main_content), heading_style="ATX")
        print(" -> Success: Converted content.")
        return header + markdown_text
    except requests.exceptions.RequestException as e:
        print(f" -> Error: Failed to fetch URL {url}. Reason: {e}")
        return None
    except Exception as e:
        # Deliberately broad: one page that fails to parse or convert must
        # not abort the whole batch; the error is reported and skipped.
        print(f" -> Error: An unexpected error occurred for {url}. Reason: {e}")
        return None


def process_url_list(base_url, paths, output_dir, delay=0.1):
    """Convert every page in ``paths`` to Markdown, one file per page.

    Each path is appended to ``base_url``; any ``#fragment`` is dropped, and
    URLs that do not start with ``http`` or were already processed are
    skipped. Pages that convert successfully are written as UTF-8 files in
    ``output_dir``, which is created if it does not exist.

    Args:
        base_url: URL prefix every entry of ``paths`` is appended to.
        paths: Iterable of URL paths relative to ``base_url``.
        output_dir: Directory the Markdown files are written to.
        delay: Seconds to pause after each request; values <= 0 disable
            the pause.
    """
    os.makedirs(output_dir, exist_ok=True)
    print(f"Saving files to '{output_dir}/' directory.")

    with requests.Session() as session:
        session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})
        processed_urls = set()

        for path in paths:
            # Drop the fragment from the path itself so both the request URL
            # and the output filename describe the same page.
            clean_path = path.split('#', 1)[0]
            cleaned_url = (base_url + path).split('#', 1)[0]
            if (
                not cleaned_url
                or not cleaned_url.startswith("http")
                or cleaned_url in processed_urls
            ):
                continue
            processed_urls.add(cleaned_url)

            content = scrape_and_convert_url(cleaned_url, session)
            if content:
                filepath = os.path.join(output_dir, sanitize_filename(clean_path))
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(content)
                print(f" -> Saved: {filepath}")

            # Short pause between requests to avoid hammering the server.
            if delay > 0:
                time.sleep(delay)

    print(f"\n🎉 Conversion complete! All content saved to the '{output_dir}' directory.")


if __name__ == '__main__':
    BASE_URL = "https://doc.rust-lang.org/book/"
    URL_PATHS = [
        "",
        "appendix-00.html",
        "appendix-01-keywords.html",
        "appendix-02-operators.html",
        "appendix-03-derivable-traits.html",
        "appendix-04-useful-development-tools.html",
        "appendix-05-editions.html",
        "appendix-06-translation.html",
        "appendix-07-nightly-rust.html",
        "ch00-00-introduction.html",
        "ch01-00-getting-started.html",
        "ch01-01-installation.html",
        "ch01-02-hello-world.html",
        "ch01-03-hello-cargo.html",
        "ch02-00-guessing-game-tutorial.html",
        "ch03-00-common-programming-concepts.html",
        "ch03-01-variables-and-mutability.html",
        "ch03-02-data-types.html",
        "ch03-03-how-functions-work.html",
        "ch03-04-comments.html",
        "ch03-05-control-flow.html",
        "ch04-00-understanding-ownership.html",
        "ch04-01-what-is-ownership.html",
        "ch04-02-references-and-borrowing.html",
        "ch04-03-slices.html",
        "ch05-00-structs.html",
        "ch05-01-defining-structs.html",
        "ch05-02-example-structs.html",
        "ch05-03-method-syntax.html",
        "ch06-00-enums.html",
        "ch06-01-defining-an-enum.html",
        "ch06-02-match.html",
        "ch06-03-if-let.html",
        "ch07-00-managing-growing-projects-with-packages-crates-and-modules.html",
        "ch07-01-packages-and-crates.html",
        "ch07-02-defining-modules-to-control-scope-and-privacy.html",
        "ch07-03-paths-for-referring-to-an-item-in-the-module-tree.html",
        "ch07-04-bringing-paths-into-scope-with-the-use-keyword.html",
        "ch07-05-separating-modules-into-different-files.html",
        "ch08-00-common-collections.html",
        "ch08-01-vectors.html",
        "ch08-02-strings.html",
        "ch08-03-hash-maps.html",
        "ch09-00-error-handling.html",
        "ch09-01-unrecoverable-errors-with-panic.html",
        "ch09-02-recoverable-errors-with-result.html",
        "ch09-03-to-panic-or-not-to-panic.html",
        "ch10-00-generics.html",
        "ch10-01-syntax.html",
        "ch10-02-traits.html",
        "ch10-03-lifetime-syntax.html",
        "ch11-00-testing.html",
        "ch11-01-writing-tests.html",
        "ch11-02-running-tests.html",
        "ch11-03-test-organization.html",
        "ch12-00-an-io-project.html",
        "ch12-01-accepting-command-line-arguments.html",
        "ch12-02-reading-a-file.html",
        "ch12-03-improving-error-handling-and-modularity.html",
        "ch12-04-testing-the-librarys-functionality.html",
        "ch12-05-working-with-environment-variables.html",
        "ch12-06-writing-to-stderr-instead-of-stdout.html",
        "ch13-00-functional-features.html",
        "ch13-01-closures.html",
        "ch13-02-iterators.html",
        "ch13-03-improving-our-io-project.html",
        "ch13-04-performance.html",
        "ch14-00-more-about-cargo.html",
        "ch14-01-release-profiles.html",
        "ch14-02-publishing-to-crates-io.html",
        "ch14-03-cargo-workspaces.html",
        "ch14-04-installing-binaries.html",
        "ch14-05-extending-cargo.html",
        "ch15-00-smart-pointers.html",
        "ch15-01-box.html",
        "ch15-02-deref.html",
        "ch15-03-drop.html",
        "ch15-04-rc.html",
        "ch15-05-interior-mutability.html",
        "ch15-06-reference-cycles.html",
        "ch16-00-concurrency.html",
        "ch16-01-threads.html",
        "ch16-02-message-passing.html",
        "ch16-03-shared-state.html",
        "ch16-04-extensible-concurrency-sync-and-send.html",
        "ch17-00-async-await.html",
        "ch17-01-futures-and-syntax.html",
        "ch17-02-concurrency-with-async.html",
        "ch17-03-more-futures.html",
        "ch17-04-streams.html",
        "ch17-05-traits-for-async.html",
        "ch17-06-futures-tasks-threads.html",
        "ch18-00-oop.html",
        "ch18-01-what-is-oo.html",
        "ch18-02-trait-objects.html",
        "ch18-03-oo-design-patterns.html",
        "ch19-00-patterns.html",
        "ch19-01-all-the-places-for-patterns.html",
        "ch19-02-refutability.html",
        "ch19-03-pattern-syntax.html",
        "ch20-00-advanced-features.html",
        "ch20-01-unsafe-rust.html",
        "ch20-02-advanced-traits.html",
        "ch20-03-advanced-types.html",
        "ch20-04-advanced-functions-and-closures.html",
        "ch20-05-macros.html",
        "ch21-00-final-project-a-web-server.html",
        "ch21-01-single-threaded.html",
        "ch21-02-multithreaded.html",
        "ch21-03-graceful-shutdown-and-cleanup.html",
    ]
    OUTPUT_DIRECTORY = "rust_book_markdown"

    process_url_list(BASE_URL, URL_PATHS, OUTPUT_DIRECTORY)