"""Correct the capitalisation of German text in HTML files using an LLM.

Every file in ``input_folder`` is read, its visible text fragments are sent in
random batches to a chat endpoint that is asked to fix only the
capitalisation, and the answer is mapped back word by word onto the original
markup.  The result is written under the same file name to ``output_folder``.
"""

import os
import random

import requests
from bs4 import BeautifulSoup

# Characters that are stored as named HTML entities in the processed files.
# `special` encodes with this table, `normal` decodes with it, so the two
# functions stay exact inverses of each other.
_ENTITY_BY_CHAR = {
    'Ä': '&Auml;',
    'Ö': '&Ouml;',
    'Ü': '&Uuml;',
    'ä': '&auml;',
    'ö': '&ouml;',
    'ü': '&uuml;',
    '>': '&gt;',
    '<': '&lt;',
    'ß': '&szlig;',
    '€': '&euro;',
}
_SPECIAL_TABLE = str.maketrans(_ENTITY_BY_CHAR)

# Entity that is ignored when text from the raw markup is matched against the
# text fragments reported by BeautifulSoup.
_NBSP_ENTITY = "&nbsp;"

# Everything up to and including this marker is copied to the output verbatim.
# NOTE(review): the marker is matched case-sensitively; confirm that the input
# files never use an upper-case closing head tag.
_HEAD_CLOSE = "</head>"

# Number of still unresolved text fragments sent to the model per request.
_BATCH_SIZE = 30

# Upper bound for one chat request; without it a stalled server would block
# the whole run forever.
REQUEST_TIMEOUT_SECONDS = 600

# Clean-ups applied, in this order, to every answer of the model.
_OUTPUT_FIXUPS = (
    ("\n", ""),
    (", dass ", ", daß "),
    (", Dass ", ", daß "),
    (", Daß ", ", daß "),
    (", Der ", ", der "),
    (", Die ", ", die "),
    (", Das ", ", das "),
)


def special(text: str) -> str:
    """Return *text* with umlauts, ``ß``, ``€``, ``<`` and ``>`` as HTML entities."""
    return text.translate(_SPECIAL_TABLE)


def normal(text: str) -> str:
    """Return *text* with the entities produced by `special` decoded again."""
    for char, entity in _ENTITY_BY_CHAR.items():
        text = text.replace(entity, char)
    return text


def replace(intext: str) -> str:
    """Ask the chat endpoint to fix the capitalisation of *intext*.

    Args:
        intext: The text fragments of one batch, separated by spaces.

    Returns:
        The content of the last ``message`` item of the response with line
        breaks removed and the fixed post-corrections applied.

    Raises:
        requests.RequestException: The request failed, timed out or returned
            an HTTP error status.
        RuntimeError: The response contained no ``message`` item.
    """
    url = "http:(...GEHEIMTRATSCH...)/api/v1/chat"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "meta-llama-3.1-8b-instruct",
        "input": "Der nachfolgende Text ist aus verschiedenen Sätzen und Worten zusammengesetzt. "
                 "Ändere nur die Kapitalisierung der Anfangsbuchstaben, falls es dir im jeweiligen Kontext nach deutscher Schreibweise richtig erscheint. "
                 "Alle Leerzeichen werden unverändert übernommen. Kopiere die benutzte Schreibweise der Worte, auch wenn sie falsch ist. "
                 "Danach überprüfe nochmals ob wirklich kein einziges Zeichen verändert oder hinzugefügt wurde und korrigiere dies gegebenenfalls. "
                 "Kommentiere die Antwort nicht.\n\n" + intext
    }
    response = requests.post(
        url, headers=headers, json=data, timeout=REQUEST_TIMEOUT_SECONDS
    )
    response.raise_for_status()

    out_text = None
    for item in response.json()["output"]:
        if item["type"] == "message":
            out_text = item["content"]
            for old, new in _OUTPUT_FIXUPS:
                out_text = out_text.replace(old, new)
    if out_text is None:
        raise RuntimeError("chat response contained no 'message' item")
    return out_text


# Raw strings: the backslashes are path separators, not escape sequences.
input_folder = r"C:\code\in"
output_folder = r"C:\code\out"


def _flatten_whitespace(text: str) -> str:
    """Turn line breaks into spaces, collapse runs of spaces and strip the ends."""
    text = text.replace("\n", " ")
    while "  " in text:
        text = text.replace("  ", " ")
    return text.strip()


def _read_html(path: str) -> str:
    """Read *path* as UTF-8, falling back to Latin-1 if it is not valid UTF-8."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, "r", encoding="latin-1", errors='ignore') as f:
            return f.read()


def _collect_text_keys(html: str) -> dict:
    """Map every relevant text fragment of *html* to an empty replacement.

    Text inside ``script``/``style``/``comment`` elements, the doctype string
    and fragments of at most one character are left out.
    """
    soup = BeautifulSoup(html, "html.parser")
    replace_dict = {}
    for element in soup.find_all(string=True):
        if element.parent.name in ("script", "style", "comment"):
            continue
        original_text = element.string.strip()
        if "w3c//dtd" in original_text.lower():
            continue
        if len(original_text) > 1:
            replace_dict[normal(original_text)] = ""
    return replace_dict


def _resolve_capitalisation(replace_dict: dict) -> None:
    """Fill in the corrected spelling for every key of *replace_dict* in place.

    Single words are simply capitalised.  All other fragments are sent to the
    model in random batches until each of them could be located in an answer.
    """
    overall = len(replace_dict)
    found = 0
    for key in replace_dict:
        if len(key.split()) == 1:
            replace_dict[key] = key[0].upper() + key[1:]
            found += 1

    loop = 1
    while found != overall:
        items = list(replace_dict.items())
        random.shuffle(items)
        batch = [key for key, value in items if value == ""][:_BATCH_SIZE]
        if not batch:
            # Nothing left to ask for; never spin without making a request.
            break
        intext = "".join(" " + _flatten_whitespace(key) + " " for key in batch)

        out_text = replace(intext)
        print("\nIN ---- " + intext)
        print("\nOUT ---- " + out_text)
        out_lower = out_text.lower()

        for key in replace_dict:
            if replace_dict[key] != "":
                continue
            key_short = _flatten_whitespace(key)
            candidate = ""
            if found == (overall - 1):
                # Only one fragment is open, so the whole answer belongs to it.
                candidate = out_text
            else:
                # The compared prefix gets shorter with every round, which
                # makes the match more tolerant the longer the search lasts.
                limit = min(
                    len(key_short),
                    max(int(overall / 10), 70) - int(loop * found / overall),
                )
                prefix = key_short[:limit].lower()
                found_pos = out_lower.find(prefix)
                if found_pos >= 0:
                    candidate = out_text[found_pos:found_pos + len(key_short)]
                else:
                    # The answer was rewritten from "dass" to "daß", so retry
                    # with the same spelling; each "daß" is one character
                    # shorter than "dass".
                    found_pos = out_lower.find(prefix.replace(", dass", ", daß"))
                    if found_pos >= 0:
                        occurrences = out_text.count(", daß")
                        candidate = out_text[
                            found_pos:found_pos + len(key_short) - occurrences
                        ]
            # An empty replacement would look like "still unresolved", so it
            # must not be counted as found.
            if candidate:
                replace_dict[key] = candidate
                found += 1

        print("\nPROGRESS ---- " + str(found) + "/" + str(overall))
        loop += 1


def _split_markup(body: str) -> list:
    """Split *body* into alternating tag chunks and text chunks.

    Tag chunks are returned unchanged.  Text chunks that are followed by a tag
    have their entities decoded; a trailing chunk without a following tag (or
    an unterminated tag at the very end) is returned unchanged.
    """
    chunks = []
    pos = 0
    end = len(body)
    while pos < end:
        if body.startswith("<", pos):
            tag_end = body.find(">", pos + 1)
            if tag_end == -1:
                # Unterminated tag: keep the remainder instead of looping on it.
                chunks.append(body[pos:])
                break
            chunks.append(body[pos:tag_end + 1])
            pos = tag_end + 1
        else:
            next_tag = body.find("<", pos + 1)
            if next_tag == -1:
                chunks.append(body[pos:])
                break
            chunks.append(normal(body[pos:next_tag]))
            pos = next_tag
    return chunks


def _apply_replacement(entry: str, entry_key: str, replacement: str) -> str:
    """Copy the spelling of *replacement* onto *entry* word by word.

    Each word of *entry_key* is located in *entry* and, ignoring case, in
    *replacement*; the characters found in *replacement* overwrite the word in
    place, so all whitespace of *entry* is preserved.
    """
    result_chunk = entry
    in_pos = 0
    out_pos = 0
    for element in entry_key.split():
        element_nobsp = element.replace(_NBSP_ENTITY, "")
        ipos = entry.find(element_nobsp, in_pos)
        if ipos == -1:
            continue
        in_pos = ipos
        opos = replacement[out_pos:].lower().find(element_nobsp.lower())
        if opos != -1:
            out_pos += opos
            out_element = replacement[out_pos:out_pos + len(element_nobsp)]
            result_chunk = (
                result_chunk[:in_pos]
                + out_element
                + result_chunk[in_pos + len(out_element):]
            )
            out_pos += len(out_element)
        # Continue behind this word so that a repeated word is matched at its
        # own position and not at the previous one again.
        in_pos += len(element_nobsp)
    return result_chunk


def _process_file(filename: str) -> None:
    """Correct one file from ``input_folder`` and write it to ``output_folder``."""
    input_content = _read_html(os.path.join(input_folder, filename))

    replace_dict = _collect_text_keys(input_content)
    _resolve_capitalisation(replace_dict)

    # The head section is copied as is; without one the whole document is
    # treated as body.
    head = input_content.find(_HEAD_CLOSE)
    split_at = head + len(_HEAD_CLOSE) if head != -1 else 0

    parts = [input_content[:split_at]]
    for entry in _split_markup(input_content[split_at:]):
        if entry.startswith("<"):
            # Tags are written back unchanged.
            parts.append(entry)
            continue
        entry_key = normal(entry).strip()
        lookup_key = entry_key.replace(_NBSP_ENTITY, "")
        if lookup_key in replace_dict:
            fixed = _apply_replacement(entry, entry_key, replace_dict[lookup_key])
            parts.append(special(fixed))
        else:
            parts.append(special(entry))

    with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as f:
        f.write("".join(parts))


def main() -> None:
    """Process every regular file found in ``input_folder``."""
    os.makedirs(output_folder, exist_ok=True)
    for filename in os.listdir(input_folder):
        if not os.path.isfile(os.path.join(input_folder, filename)):
            continue
        _process_file(filename)


if __name__ == "__main__":
    main()