Составляем словарь имен для перевода

Comments

Это делается в два этапа: сначала проходим по переводу и с помощью LLM выбираем все имена собственные, используя такой промпт:

# System prompt for the name-extraction pass. It is a plain string: there are
# no placeholders, so no f-string prefix is needed. The prompt ends with
# " -  "; presumably the text fragment is appended right after it by the
# caller — TODO confirm.
sys_prompt_names = """
            identify all personal names in text fragment and return them separated by comma. Don't add comment and don't translate -  """

Все найденные имена сохраняем в текстовый файл, а затем оставляем в списке только уникальные имена и подсчитываем, сколько раз встретилось каждое:

import codecs
from collections import Counter
import argparse

def process_unique_names(input_filename, output_filename='unique-names.txt'):
    """
    Count how often each name occurs in a file and write the tally to a file.

    Every input line is treated as a list of names separated by an ASCII
    comma, a fullwidth comma (U+FF0C) or an ideographic comma (U+3001).
    Names are stripped of surrounding whitespace; empty entries and entries
    containing an ideographic full stop (U+3002) are discarded as noise.

    The tally is written to ``output_filename`` as UTF-8, one ``name,count``
    line per unique name, in order of first appearance.

    Args:
        input_filename: Path to the UTF-8 text file holding the names.
        output_filename: Path of the file the tally is written to.

    Returns:
        None. Progress is reported with ``print``; any exception raised while
        reading or writing is caught and reported with ``print`` as well
        instead of propagating to the caller.
    """
    # The separators are spelled as escapes so they cannot be silently turned
    # into ASCII look-alikes by an editor or a copy/paste, which would make
    # the normalisation a no-op.
    separators = str.maketrans({"\uff0c": ",", "\u3001": ","})
    full_stop = "\u3002"
    name_counts = Counter()

    print(f"Reading {input_filename}...")

    try:
        # Stream the file line by line so large inputs are never held in memory.
        with open(input_filename, 'r', encoding='utf-8') as file:
            for line in file:
                for raw_name in line.translate(separators).split(','):
                    name = raw_name.strip()
                    # Skip blanks and fragments that look like sentences.
                    if not name or full_stop in name:
                        continue
                    name_counts[name] += 1

        print(f"Found {len(name_counts)} unique names. Writing to {output_filename}...")

        with open(output_filename, 'w', encoding='utf-8') as file_o:
            file_o.writelines(
                f"{name},{count}\n" for name, count in name_counts.items()
            )

        print("Success.")

    except FileNotFoundError:
        print(f"Error: File '{input_filename}' not found.")
    except Exception as e:
        # Script-level boundary: report the problem instead of crashing.
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # Command-line entry point: a single positional argument gives the input
    # path; the tally goes to the function's default output file.
    cli = argparse.ArgumentParser(
        description="Generate a unique list of names with counts from a file.",
    )
    cli.add_argument("filename", help="Path to the input text file.")
    process_unique_names(cli.parse_args().filename)


Затем в получившийся CSV-файл добавляем колонки с переводом и полом и передаём его скрипту перевода.