Hello everyone! Hope everything is fine
I am currently working on developing a web app which translates an input file and generates the output target file in sdlxliff format.
The problem I am facing right now is that text is extracted from the input file as plain text (formatting is not preserved). I want the extracted text to keep the same formatting as the input file, and that same formatting should be applied to the final .sdlxliff output file. I am sharing my text extraction and .sdlxliff generation logic here; it is Python code.
I would like someone to help me out and guide me toward the right text extraction and .sdlxliff creation logic.
My current logic for text extraction is
def extract_text(file_path: str, extension: str) -> List[str]:
    """Extract non-empty plain-text segments from a supported input file.

    Only plain text is returned; character and paragraph formatting from
    the source document is not captured by this function.

    Args:
        file_path: Path of the file to read.
        extension: File extension including the leading dot (for example
            ``".docx"``). Matching is case-insensitive.

    Returns:
        A list of stripped, non-empty text segments:
        one per paragraph for ``.docx``, one per text line for ``.pdf``
        and ``.txt``, and one per worksheet row for ``.xlsx``. An
        unrecognised extension yields an empty list.
    """
    # Normalise so ".DOCX" / ".Txt" are handled like their lowercase forms,
    # matching how create_sdlxliff() lowercases the extension it receives.
    ext = extension.lower()
    segments: List[str] = []

    if ext == '.docx':
        doc = Document(file_path)
        segments = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    elif ext == '.pdf':
        reader = PdfReader(file_path)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                # Split by newlines, filter out empty lines
                segments.extend(line.strip() for line in page_text.split('\n') if line.strip())
    elif ext == '.xlsx':
        # data_only=True is passed so cells are read as values
        # (see openpyxl load_workbook for the exact semantics).
        wb = load_workbook(file_path, data_only=True)
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            for row in ws.iter_rows(values_only=True):
                # Empty cells contribute an empty string, so gaps in a row
                # show up as consecutive spaces inside the joined segment.
                row_text = " ".join(str(cell) if cell is not None else "" for cell in row).strip()
                if row_text:
                    segments.append(row_text)
    elif ext == '.txt':
        # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            segments = [line.strip() for line in f if line.strip()]

    return segments
My .sdlxliff generation logic is
def create_sdlxliff(source_segments: List[str], translated_segments: List[str], original_filename: str,
                    file_extension: str, source_lang: str, target_lang: str) -> str:
    """Build an SDLXLIFF document (XLIFF 1.2 plus SDL extensions) as a string.

    Args:
        source_segments: Source-language text segments.
        translated_segments: Target-language segments, paired by position
            with ``source_segments``.
        original_filename: Value written to the ``original`` attribute of
            the ``<file>`` element.
        file_extension: Extension used to look up the SDL file type id in
            ``FILE_TYPE_MAP`` (lowercased first; ``'Unknown'`` if absent).
        source_lang: Value for the ``source-language`` attribute.
        target_lang: Value for the ``target-language`` attribute.

    Returns:
        The serialized XML, including the XML declaration, decoded from
        UTF-8 into a ``str``.
    """
    xliff_ns = "urn:oasis:names:tc:xliff:document:1.2"
    # Clark notation ("{uri}name") needs the complete namespace URI; a
    # shortened form such as "sdl.com/.../1.0" is a different namespace.
    sdl_ns = "http://sdl.com/FileTypes/SdlXliff/1.0"

    def sdl(local_name: str) -> str:
        """Return ``local_name`` qualified with the SDL namespace."""
        return f"{{{sdl_ns}}}{local_name}"

    # Registering prefixes affects ElementTree globally; XLIFF becomes the
    # default namespace and SDL extension names are written with "sdl:".
    ET.register_namespace("", xliff_ns)
    ET.register_namespace("sdl", sdl_ns)

    root = ET.Element(f"{{{xliff_ns}}}xliff", {
        "version": "1.2",
        sdl("version"): "1.0",
    })
    file_elem = ET.SubElement(root, "file", {
        "original": original_filename,
        "source-language": source_lang,
        "target-language": target_lang,
        "datatype": "x-sdlfilterframework2",
        sdl("doc-id"): str(uuid.uuid4()),
        sdl("lastHash"): "0",
        sdl("logical"): "true",
    })

    header_elem = ET.SubElement(file_elem, "header")
    file_type_id = FILE_TYPE_MAP.get(file_extension.lower(), 'Unknown')
    filetype_info_elem = ET.SubElement(header_elem, sdl("filetype-info"))
    filetype_id_elem = ET.SubElement(filetype_info_elem, sdl("filetype-id"))
    filetype_id_elem.text = file_type_id
    ET.SubElement(header_elem, sdl("doc-info"))
    ET.SubElement(header_elem, sdl("seg-defs"))

    body_elem = ET.SubElement(file_elem, "body")
    # zip() stops at the shorter list, so unpaired trailing segments are
    # not written.
    for src, tgt in zip(source_segments, translated_segments):
        trans_unit = ET.SubElement(body_elem, "trans-unit", {
            "id": str(uuid.uuid4()),
            "translate": "yes",
        })
        seg_source_elem = ET.SubElement(trans_unit, "seg-source")
        mrk_source = ET.SubElement(seg_source_elem, "mrk", {"mid": "1", "mtype": "seg"})
        # ElementTree escapes &, < and > itself when serializing; escaping
        # here as well would emit "&amp;amp;" for a literal "&".
        mrk_source.text = src
        target_elem = ET.SubElement(trans_unit, "target")
        mrk_target = ET.SubElement(target_elem, "mrk", {"mid": "1", "mtype": "seg"})
        mrk_target.text = tgt

    return ET.tostring(root, encoding="utf-8", xml_declaration=True).decode("utf-8")
# Original poster's note: "Everything apart from text formatting preservation is perfect."
Moved to code block.
[edited by: Paul at 5:10 PM (GMT 0) on 14 Mar 2025]

Translate