Разбираем файл fb2 на текстовые файлы с главами для загрузки на rulate.
запускать:
python split_fb2.py file_to_split.fb2
import re
import sys
import os
from pathlib import Path
def replace_chapter_number(chapter_filename, new_chapter_number):
    """Replace every 3- or 4-digit number in a filename with a new chapter number.

    The new number is zero-padded to the width of the number it replaces, so
    ``"Chapter 0123"`` with ``701`` becomes ``"Chapter 0701"``. A new number
    with more digits than the original is written in full instead of being
    cut down to the original width.

    Args:
        chapter_filename: The filename string containing a 3 or 4-digit
            chapter number.
        new_chapter_number: The new chapter number (integer or string).

    Returns:
        The filename with each run of 3-4 digits replaced; the input is
        returned unchanged when it contains no such run.
    """
    new_number = str(new_chapter_number)

    def pad_to_original_width(match):
        # Pad only, never slice: truncating to the old width would turn
        # 1000 into "000" when it replaces a 3-digit number.
        return new_number.zfill(len(match.group(0)))

    return re.sub(r"\d{3,4}", pad_to_original_width, chapter_filename)
import os
import zipfile
from bs4 import BeautifulSoup
def split_fb2_by_chapters(fb2_filepath, output_dir):
    """Split an FB2 book into one text file per ``<section>`` element.

    Each file is named after the section title (reduced to letters, digits,
    spaces, ``_`` and ``-``) and holds the section's paragraphs, one per
    line, each wrapped in ``<p>...</p>``. Paragraphs whose text starts with
    "Глава" are left out of the file body.

    Sections that would produce the same file name get a numeric suffix
    (``_2``, ``_3``, ...) so that no chapter overwrites another.

    Args:
        fb2_filepath: Path to an ``.fb2`` file, or to a ``.zip`` archive that
            contains one.
        output_dir: Directory to save the chapter files; created if missing.

    Raises:
        ValueError: If a zip archive contains no ``.fb2`` member.
        Exception: Any error from reading, parsing or writing propagates to
            the caller, so a failed run is not reported as a success.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Raw bytes are passed to the parser, which works out the encoding itself.
    soup = BeautifulSoup(_read_fb2_bytes(fb2_filepath), 'lxml-xml')

    used_names = set()
    for section in soup.find_all('section'):
        stem = _safe_file_stem(_chapter_title(section))
        chapter_filename = _unique_name(stem, used_names) + ".txt"
        chapter_filepath = os.path.join(output_dir, chapter_filename)

        # NOTE: find_all searches recursively, so the title's own <p> and the
        # paragraphs of any nested <section> are part of this list as well.
        paragraphs = (p.text.strip() for p in section.find_all('p'))
        chapter_content = "".join(
            f"<p>{text}</p>\n"
            for text in paragraphs
            if not text.startswith("Глава")
        )

        with open(chapter_filepath, 'w', encoding='utf-8') as outfile:
            outfile.write(chapter_content)


def _read_fb2_bytes(fb2_filepath):
    """Return the raw FB2 document, unpacking it from a zip archive if needed."""
    path = str(fb2_filepath)
    if not path.lower().endswith(".zip"):
        with open(path, 'rb') as f:
            return f.read()

    with zipfile.ZipFile(path, 'r') as archive:
        members = [n for n in archive.namelist() if n.lower().endswith(".fb2")]
        if not members:
            raise ValueError(f"No .fb2 file found inside '{path}'")
        return archive.read(members[0])


def _chapter_title(section):
    """Return the section's title text, or a default when it has no <title>."""
    title_tag = section.find('title')
    if title_tag is None:
        return "Untitled Chapter"
    if title_tag.p is not None:
        return title_tag.p.text.strip()
    return title_tag.text.strip()


def _safe_file_stem(title):
    """Reduce a title to filename-safe characters, at most 200 of them."""
    stem = "".join(ch for ch in title if ch.isalnum() or ch in " _-")[:200]
    # A title made only of punctuation would otherwise yield the file ".txt".
    return stem if stem.strip() else "Untitled Chapter"


def _unique_name(stem, used_names):
    """Return ``stem``, suffixed if necessary, and record it in ``used_names``."""
    candidate = stem
    counter = 2
    # Compare case-insensitively so names also stay distinct on
    # case-insensitive file systems.
    while candidate.casefold() in used_names:
        candidate = f"{stem}_{counter}"
        counter += 1
    used_names.add(candidate.casefold())
    return candidate
def main():
    """Command-line entry point: split the FB2 file named in ``sys.argv[1]``.

    Chapters are written to ``<input stem>_output_chapters`` in the current
    directory. Prints a usage hint and exits with status 1 when no argument
    is given; prints an error and exits with status 1 when the input file is
    missing or processing raises.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python script.py <input_fb2_file>")
        print("Example: python script.py paint-vol-1-final-tl.fb2")
        sys.exit(1)

    fb2_file = cli_args[0]
    # The output folder is named after the input file, minus its last suffix.
    output_directory = f"{Path(fb2_file).stem}_output_chapters"

    # Reject a missing input before anything is created on disk.
    if not os.path.isfile(fb2_file):
        print(f"Error: Input file '{fb2_file}' not found")
        sys.exit(1)

    try:
        Path(output_directory).mkdir(parents=True, exist_ok=True)
        split_fb2_by_chapters(fb2_file, output_directory)
        print(f"FB2 file '{fb2_file}' split into chapters in '{output_directory}'")
    except FileNotFoundError as e:
        print(f"Error: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()