Hello everyone! I hope everything is fine.
I am currently working on developing a web app which translates an input file and generates the output target file in sdlxliff format.
The problem that I am facing right now is that when text is extracted from the input file, it is extracted as plain text (formatting is not preserved). I want the extracted text to keep the same formatting as the input file, and the same text formatting should be applied to the final .sdlxliff output file. I am sharing my text extraction and .sdlxliff generation logic here; it is Python code.
I would like someone to help me out and guide me toward the right text extraction and .sdlxliff creation logic.
My current logic for text extraction is:
def extract_text(file_path: str, extension: str) -> List[str]:
    """Extract the non-empty, whitespace-stripped text segments of a file.

    Only plain text is returned: each branch reads the textual content of a
    paragraph / line / row, so inline formatting (bold, italics, fonts, ...)
    is not carried over into the segments.

    Args:
        file_path: Path of the file to read.
        extension: File extension including the leading dot ('.docx', '.pdf',
            '.xlsx' or '.txt'). Matched case-insensitively.

    Returns:
        One string per paragraph (.docx), text line (.pdf, .txt) or worksheet
        row (.xlsx). An unsupported extension yields an empty list.
    """
    # Normalize so '.TXT' / '.Docx' behave like their lower-case forms, in
    # line with create_sdlxliff(), which also lower-cases the extension.
    extension = extension.lower()
    segments: List[str] = []

    if extension == '.docx':
        # NOTE(review): Document is presumably python-docx's Document - confirm.
        doc = Document(file_path)
        stripped = (para.text.strip() for para in doc.paragraphs)
        segments = [text for text in stripped if text]
    elif extension == '.pdf':
        reader = PdfReader(file_path)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                # Split by newlines, filter out empty lines
                stripped = (line.strip() for line in page_text.split('\n'))
                segments.extend(line for line in stripped if line)
    elif extension == '.xlsx':
        wb = load_workbook(file_path, data_only=True)
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            for row in ws.iter_rows(values_only=True):
                # Empty cells become "" so the cells of a row stay
                # space-separated in their original order.
                row_text = " ".join(
                    str(cell) if cell is not None else "" for cell in row
                ).strip()
                if row_text:
                    segments.append(row_text)
    elif extension == '.txt':
        # 'utf-8-sig' decodes plain UTF-8 identically but also drops a leading
        # byte-order mark, which str.strip() would otherwise leave attached to
        # the first segment.
        with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
            stripped = (line.strip() for line in f)
            segments = [line for line in stripped if line]

    return segments


# My .sdlxliff generation logic is
def create_sdlxliff(source_segments: List[str], translated_segments: List[str],
                    original_filename: str, file_extension: str,
                    source_lang: str, target_lang: str) -> str:
    """Build an SDLXLIFF (XLIFF 1.2 + SDL extensions) document as a string.

    Args:
        source_segments: Source-language segments, in document order.
        translated_segments: Translations, paired positionally with
            ``source_segments``. The pairing uses ``zip``, so surplus items of
            the longer list are dropped.
        original_filename: Written to the ``original`` attribute of ``<file>``.
        file_extension: Looked up (lower-cased) in ``FILE_TYPE_MAP`` to obtain
            the ``sdl:filetype-id``; unknown extensions map to 'Unknown'.
        source_lang: Written to ``source-language``.
        target_lang: Written to ``target-language``.

    Returns:
        The serialized XML document, including the XML declaration.
    """
    xliff_ns = "urn:oasis:names:tc:xliff:document:1.2"
    # The URI in the original post was truncated by the forum renderer; this
    # is the namespace SDLXLIFF files declare for the "sdl" prefix.
    sdl_ns = "http://sdl.com/FileTypes/SdlXliff/1.0"
    ET.register_namespace("", xliff_ns)
    ET.register_namespace("sdl", sdl_ns)

    root = ET.Element(f"{{{xliff_ns}}}xliff", {
        "version": "1.2",
        f"{{{sdl_ns}}}version": "1.0",
    })

    # Child elements are created with unqualified tags: they are written
    # without a prefix and therefore fall under the default xmlns declared on
    # the root element.
    file_elem = ET.SubElement(root, "file", {
        "original": original_filename,
        "source-language": source_lang,
        "target-language": target_lang,
        "datatype": "x-sdlfilterframework2",
        f"{{{sdl_ns}}}doc-id": str(uuid.uuid4()),
        f"{{{sdl_ns}}}lastHash": "0",
        f"{{{sdl_ns}}}logical": "true",
    })

    header_elem = ET.SubElement(file_elem, "header")
    file_type_id = FILE_TYPE_MAP.get(file_extension.lower(), 'Unknown')
    filetype_info_elem = ET.SubElement(header_elem, f"{{{sdl_ns}}}filetype-info")
    filetype_id_elem = ET.SubElement(filetype_info_elem, f"{{{sdl_ns}}}filetype-id")
    filetype_id_elem.text = file_type_id
    ET.SubElement(header_elem, f"{{{sdl_ns}}}doc-info")
    ET.SubElement(header_elem, f"{{{sdl_ns}}}seg-defs")

    body_elem = ET.SubElement(file_elem, "body")
    for src, tgt in zip(source_segments, translated_segments):
        trans_unit = ET.SubElement(body_elem, "trans-unit", {
            "id": str(uuid.uuid4()),
            "translate": "yes",
        })
        seg_source_elem = ET.SubElement(trans_unit, "seg-source")
        mrk_source = ET.SubElement(seg_source_elem, "mrk", {"mid": "1", "mtype": "seg"})
        # Assign the raw text: ElementTree escapes &, < and > itself when
        # serializing, so pre-escaping here would emit "&amp;amp;" and friends.
        mrk_source.text = src
        target_elem = ET.SubElement(trans_unit, "target")
        mrk_target = ET.SubElement(target_elem, "mrk", {"mid": "1", "mtype": "seg"})
        mrk_target.text = tgt

    return ET.tostring(root, encoding="utf-8", xml_declaration=True).decode("utf-8")
Everything apart from text formatting preservation is perfect.
Moved to code block.
[edited by: Paul at 5:10 PM (GMT 0) on 14 Mar 2025]
