# audio-oxide-old/imagdagdos.py
#
# 139 lines
# 7.3 KiB
# Python

import os
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import re
import time
# Make sure you have the required libraries:
# pip install requests beautifulsoup4 markdownify
def sanitize_filename(url_path):
    """Create a safe Markdown filename from a URL path.

    A trailing ``.html``/``.htm`` extension is dropped, every run of
    characters other than ASCII letters, digits and dots is collapsed into a
    single underscore, and the result is lower-cased, limited to 100
    characters and given a ``.md`` suffix.

    Args:
        url_path: Relative URL path, e.g. ``"ch01-01-installation.html"``.

    Returns:
        The sanitized filename, or ``"index.md"`` when nothing usable is
        left (for example for ``""`` or ``"/"``).
    """
    # Remove the extension *before* sanitizing: dots survive the substitution
    # below, so a ".html" suffix never turns into "_html" and a check made
    # afterwards would leave names like "page.html.md".
    stem = re.sub(r'\.html?$', '', url_path, flags=re.IGNORECASE)
    stem = re.sub(r'[^a-zA-Z0-9\.]+', '_', stem)
    # Trim again after truncating so a cut in the middle of a separator run
    # does not leave a dangling underscore.
    stem = stem.strip('_').lower()[:100].rstrip('_')
    if not stem:
        return "index.md"
    return stem + ".md"
def scrape_and_convert_url(url, session):
    """Fetch a single URL and convert its main content section to Markdown.

    The content container is looked up by ``id="main-content"`` first and by
    the ``<main>`` tag as a fallback.

    Args:
        url: Absolute URL of the page to fetch.
        session: A ``requests.Session`` (or any object with a compatible
            ``get(url, timeout=...)`` method) used to perform the request.

    Returns:
        A ``# Source: <url>`` header followed by the converted Markdown, or
        ``None`` when the request fails, no content container is found, or
        any other error occurs. Errors are reported on stdout, not raised.
    """
    print(f"Fetching: {url}")
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()

        # Parse the raw bytes instead of response.text: when the server sends
        # "text/html" without a charset, requests decodes .text as ISO-8859-1,
        # which garbles UTF-8 pages. Only trust the HTTP encoding when it was
        # explicitly declared; otherwise let BeautifulSoup detect it.
        content_type = response.headers.get('content-type', '').lower()
        http_encoding = response.encoding if 'charset' in content_type else None
        soup = BeautifulSoup(
            response.content, 'html.parser', from_encoding=http_encoding
        )

        # 1. Try to find the most specific ID first.
        main_content = soup.find(id="main-content")
        # 2. If that fails, fall back to the standard <main> HTML tag.
        if main_content is None:
            main_content = soup.find("main")

        if main_content is None:
            print(" -> Warning: Could not find a recognizable main content section in page.")
            return None

        # Prepend the source URL so each saved file records where it came from.
        header = f"# Source: {url}\n\n"
        markdown_text = md(str(main_content), heading_style="ATX")
        print(" -> Success: Converted content.")
        return header + markdown_text
    except requests.exceptions.RequestException as e:
        print(f" -> Error: Failed to fetch URL {url}. Reason: {e}")
        return None
    except Exception as e:
        # Best-effort scraper: report parsing/conversion failures and let the
        # caller move on to the next page.
        print(f" -> Error: An unexpected error occurred for {url}. Reason: {e}")
        return None
def process_url_list(base_url, paths, output_dir):
    """Convert a list of pages to Markdown files in ``output_dir``.

    Each path is appended to ``base_url``, its ``#fragment`` is removed, and
    the resulting page is fetched once through a shared session. URLs that do
    not start with ``http`` or that were already processed are skipped. Pages
    for which no content could be produced are not written.

    Args:
        base_url: URL prefix that every entry of ``paths`` is appended to.
        paths: Iterable of relative URL paths.
        output_dir: Directory that receives the ``.md`` files; created if it
            does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    print(f"Saving files to '{output_dir}/' directory.")

    with requests.Session() as session:
        session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})
        processed_urls = set()

        for path in paths:
            full_url = base_url + path
            cleaned_url = full_url.split('#')[0]
            if not cleaned_url or not cleaned_url.startswith("http") or cleaned_url in processed_urls:
                continue
            processed_urls.add(cleaned_url)

            content = scrape_and_convert_url(cleaned_url, session)
            if content:
                # Name the file after the page itself: the fragment was
                # dropped for fetching, so it must not leak into the filename.
                filename = sanitize_filename(path.split('#')[0])
                filepath = os.path.join(output_dir, filename)
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(content)
                print(f" -> Saved: {filepath}")

            # Short pause between requests to avoid hammering the server.
            time.sleep(0.1)

    print(f"\n🎉 Conversion complete! All content saved to the '{output_dir}' directory.")
if __name__ == '__main__':
    # Script entry point: convert the listed pages under BASE_URL and store
    # one Markdown file per page in OUTPUT_DIRECTORY.
    BASE_URL = "https://doc.rust-lang.org/book/"
    OUTPUT_DIRECTORY = "rust_book_markdown"

    # Paths are relative to BASE_URL and are processed in this order.
    URL_PATHS = [
        # Root page (BASE_URL itself)
        "",
        # appendix-*
        "appendix-00.html",
        "appendix-01-keywords.html",
        "appendix-02-operators.html",
        "appendix-03-derivable-traits.html",
        "appendix-04-useful-development-tools.html",
        "appendix-05-editions.html",
        "appendix-06-translation.html",
        "appendix-07-nightly-rust.html",
        # ch00-* .. ch02-*
        "ch00-00-introduction.html",
        "ch01-00-getting-started.html",
        "ch01-01-installation.html",
        "ch01-02-hello-world.html",
        "ch01-03-hello-cargo.html",
        "ch02-00-guessing-game-tutorial.html",
        # ch03-*
        "ch03-00-common-programming-concepts.html",
        "ch03-01-variables-and-mutability.html",
        "ch03-02-data-types.html",
        "ch03-03-how-functions-work.html",
        "ch03-04-comments.html",
        "ch03-05-control-flow.html",
        # ch04-*
        "ch04-00-understanding-ownership.html",
        "ch04-01-what-is-ownership.html",
        "ch04-02-references-and-borrowing.html",
        "ch04-03-slices.html",
        # ch05-*
        "ch05-00-structs.html",
        "ch05-01-defining-structs.html",
        "ch05-02-example-structs.html",
        "ch05-03-method-syntax.html",
        # ch06-*
        "ch06-00-enums.html",
        "ch06-01-defining-an-enum.html",
        "ch06-02-match.html",
        "ch06-03-if-let.html",
        # ch07-*
        "ch07-00-managing-growing-projects-with-packages-crates-and-modules.html",
        "ch07-01-packages-and-crates.html",
        "ch07-02-defining-modules-to-control-scope-and-privacy.html",
        "ch07-03-paths-for-referring-to-an-item-in-the-module-tree.html",
        "ch07-04-bringing-paths-into-scope-with-the-use-keyword.html",
        "ch07-05-separating-modules-into-different-files.html",
        # ch08-*
        "ch08-00-common-collections.html",
        "ch08-01-vectors.html",
        "ch08-02-strings.html",
        "ch08-03-hash-maps.html",
        # ch09-*
        "ch09-00-error-handling.html",
        "ch09-01-unrecoverable-errors-with-panic.html",
        "ch09-02-recoverable-errors-with-result.html",
        "ch09-03-to-panic-or-not-to-panic.html",
        # ch10-*
        "ch10-00-generics.html",
        "ch10-01-syntax.html",
        "ch10-02-traits.html",
        "ch10-03-lifetime-syntax.html",
        # ch11-*
        "ch11-00-testing.html",
        "ch11-01-writing-tests.html",
        "ch11-02-running-tests.html",
        "ch11-03-test-organization.html",
        # ch12-*
        "ch12-00-an-io-project.html",
        "ch12-01-accepting-command-line-arguments.html",
        "ch12-02-reading-a-file.html",
        "ch12-03-improving-error-handling-and-modularity.html",
        "ch12-04-testing-the-librarys-functionality.html",
        "ch12-05-working-with-environment-variables.html",
        "ch12-06-writing-to-stderr-instead-of-stdout.html",
        # ch13-*
        "ch13-00-functional-features.html",
        "ch13-01-closures.html",
        "ch13-02-iterators.html",
        "ch13-03-improving-our-io-project.html",
        "ch13-04-performance.html",
        # ch14-*
        "ch14-00-more-about-cargo.html",
        "ch14-01-release-profiles.html",
        "ch14-02-publishing-to-crates-io.html",
        "ch14-03-cargo-workspaces.html",
        "ch14-04-installing-binaries.html",
        "ch14-05-extending-cargo.html",
        # ch15-*
        "ch15-00-smart-pointers.html",
        "ch15-01-box.html",
        "ch15-02-deref.html",
        "ch15-03-drop.html",
        "ch15-04-rc.html",
        "ch15-05-interior-mutability.html",
        "ch15-06-reference-cycles.html",
        # ch16-*
        "ch16-00-concurrency.html",
        "ch16-01-threads.html",
        "ch16-02-message-passing.html",
        "ch16-03-shared-state.html",
        "ch16-04-extensible-concurrency-sync-and-send.html",
        # ch17-*
        "ch17-00-async-await.html",
        "ch17-01-futures-and-syntax.html",
        "ch17-02-concurrency-with-async.html",
        "ch17-03-more-futures.html",
        "ch17-04-streams.html",
        "ch17-05-traits-for-async.html",
        "ch17-06-futures-tasks-threads.html",
        # ch18-*
        "ch18-00-oop.html",
        "ch18-01-what-is-oo.html",
        "ch18-02-trait-objects.html",
        "ch18-03-oo-design-patterns.html",
        # ch19-*
        "ch19-00-patterns.html",
        "ch19-01-all-the-places-for-patterns.html",
        "ch19-02-refutability.html",
        "ch19-03-pattern-syntax.html",
        # ch20-*
        "ch20-00-advanced-features.html",
        "ch20-01-unsafe-rust.html",
        "ch20-02-advanced-traits.html",
        "ch20-03-advanced-types.html",
        "ch20-04-advanced-functions-and-closures.html",
        "ch20-05-macros.html",
        # ch21-*
        "ch21-00-final-project-a-web-server.html",
        "ch21-01-single-threaded.html",
        "ch21-02-multithreaded.html",
        "ch21-03-graceful-shutdown-and-cleanup.html",
    ]

    process_url_list(BASE_URL, URL_PATHS, OUTPUT_DIRECTORY)