Seeking guidance on preserving formatting

Hello everyone! I hope everything is fine.

I am currently working on developing a web app which translates an input file and generates the output target file in sdlxliff format. 

The problem I am facing right now is that when text is extracted from the input file, it comes out as plain text (formatting is not preserved). I want the extracted text to keep the formatting of the input file, and the same formatting should be applied to the final .sdlxliff output file. I am sharing my text extraction and .sdlxliff generation logic here; it is Python code.

I would appreciate it if someone could guide me toward the right text extraction and .sdlxliff creation logic.

My current logic for text extraction is:

def extract_text(file_path: str, extension: str) -> List[str]:
    """Extract non-empty plain-text segments from a supported input file.

    Only the text content is read; no formatting information (bold, fonts,
    styles, cell layout, ...) is captured by this function.

    Args:
        file_path: Path of the file to read.
        extension: File extension including the leading dot (for example
            ``'.docx'``). Matched case-insensitively.

    Returns:
        A list of whitespace-stripped, non-empty segments:

        * ``.docx`` - one segment per paragraph in ``doc.paragraphs``.
        * ``.pdf``  - one segment per line of each page's extracted text.
        * ``.xlsx`` - one segment per row, cell values joined by a space
          (``None`` cells contribute an empty string).
        * ``.txt``  - one segment per line.

        An unsupported extension yields an empty list.
    """
    # Normalize once so '.DOCX' / '.Txt' are handled like their lowercase
    # forms, consistent with the lowercase lookup used when building the
    # SDLXLIFF file.
    extension = extension.lower()

    segments = []
    if extension == '.docx':
        doc = Document(file_path)
        for para in doc.paragraphs:
            text = para.text.strip()
            if text:
                segments.append(text)
    elif extension == '.pdf':
        reader = PdfReader(file_path)
        for page in reader.pages:
            page_text = page.extract_text()
            if not page_text:
                # Nothing extractable on this page.
                continue
            # Split by newlines, filter out empty lines
            for line in page_text.split('\n'):
                text = line.strip()
                if text:
                    segments.append(text)
    elif extension == '.xlsx':
        wb = load_workbook(file_path, data_only=True)
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            for row in ws.iter_rows(values_only=True):
                row_text = " ".join(str(cell) if cell is not None else "" for cell in row).strip()
                if row_text:
                    segments.append(row_text)
    elif extension == '.txt':
        # errors='ignore' silently drops bytes that are not valid UTF-8
        # instead of raising, so a badly encoded file still yields text.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                text = line.strip()
                if text:
                    segments.append(text)
    return segments
My .sdlxliff generation logic is:
def create_sdlxliff(source_segments: List[str], translated_segments: List[str], original_filename: str,
                   file_extension: str, source_lang: str, target_lang: str) -> str:
    """Build an SDLXLIFF (XLIFF 1.2 plus ``sdl:`` extensions) document.

    One ``trans-unit`` is emitted per (source, translation) pair. Segments
    are written as plain text only; no inline formatting tags are produced.

    Args:
        source_segments: Source-language segments.
        translated_segments: Translations, positionally aligned with
            ``source_segments``. Pairing uses ``zip``, so surplus items in
            the longer list are silently dropped.
        original_filename: Stored in the ``original`` attribute of ``file``.
        file_extension: Looked up (lowercased) in ``FILE_TYPE_MAP`` to fill
            ``sdl:filetype-id``; falls back to ``'Unknown'``.
        source_lang: Value of the ``source-language`` attribute.
        target_lang: Value of the ``target-language`` attribute.

    Returns:
        The serialized document as a ``str``, including a UTF-8 XML
        declaration.
    """
    xliff_ns = "urn:oasis:names:tc:xliff:document:1.2"
    # NOTE(review): the namespace URI in the pasted code was truncated to
    # "sdl.com/.../1.0" (and broke the syntax of this call). Restored to the
    # SDLXLIFF namespace; confirm against a file produced by Trados Studio.
    sdl_ns = "http://sdl.com/FileTypes/SdlXliff/1.0"

    ET.register_namespace("", xliff_ns)
    ET.register_namespace("sdl", sdl_ns)

    def sdl(name: str) -> str:
        """Return *name* qualified with the SDL namespace (Clark notation)."""
        return "{%s}%s" % (sdl_ns, name)

    root = ET.Element("{%s}xliff" % xliff_ns, {
        "version": "1.2",
        sdl("version"): "1.0",
    })

    file_elem = ET.SubElement(root, "file", {
        "original": original_filename,
        "source-language": source_lang,
        "target-language": target_lang,
        "datatype": "x-sdlfilterframework2",
        sdl("doc-id"): str(uuid.uuid4()),
        sdl("lastHash"): "0",
        sdl("logical"): "true",
    })

    header_elem = ET.SubElement(file_elem, "header")
    file_type_id = FILE_TYPE_MAP.get(file_extension.lower(), 'Unknown')
    filetype_info_elem = ET.SubElement(header_elem, sdl("filetype-info"))
    filetype_id_elem = ET.SubElement(filetype_info_elem, sdl("filetype-id"))
    filetype_id_elem.text = file_type_id
    ET.SubElement(header_elem, sdl("doc-info"))
    ET.SubElement(header_elem, sdl("seg-defs"))

    body_elem = ET.SubElement(file_elem, "body")
    for src, tgt in zip(source_segments, translated_segments):
        trans_unit = ET.SubElement(body_elem, "trans-unit", {
            "id": str(uuid.uuid4()),
            "translate": "yes"
        })
        # NOTE(review): every segment uses mid="1"; verify whether the
        # consuming tool expects segment ids to be unique across the file.
        seg_source_elem = ET.SubElement(trans_unit, "seg-source")
        mrk_source = ET.SubElement(seg_source_elem, "mrk", {"mid": "1", "mtype": "seg"})
        # Assign raw text: ElementTree escapes &, < and > itself when
        # serializing, so pre-escaping would emit "&amp;amp;" etc.
        mrk_source.text = src
        target_elem = ET.SubElement(trans_unit, "target")
        mrk_target = ET.SubElement(target_elem, "mrk", {"mid": "1", "mtype": "seg"})
        mrk_target.text = tgt

    return ET.tostring(root, encoding="utf-8", xml_declaration=True).decode("utf-8")



Everything apart from text formatting preservation is perfect.


Moved to code block.
[edited by: Paul at 5:10 PM (GMT 0) on 14 Mar 2025]
emoji