RFS/dis/MZ-5Z008/postprocess_dz80.py

#!/usr/bin/env python3
#########################################################################################################
##
## Name:            postprocess_dz80.py
## Created:         March 2026
## Author(s):       Philip Smart
## Description:     Post-processes dz80 disassembly output for GLASS Z80 assembler compatibility.
##                  Adapted for MZ-5Z008 Disk BASIC.
##
##                  Fixes:
##                    1. Undefined Lxxxx label references -> numeric hex values
##                    2. Compacts consecutive same-value DB lines into DS count, value
##                    3. Fixes any string quoting issues
##                    4. Reformats BASIC keyword tables (one keyword per line)
##                    5. Converts runs of printable-ASCII DB bytes into DB "text" form
##
## Credits:
## Copyright:       (c) 2026 Philip Smart <philip.smart@net2net.org>
##
## History:         March 2026 - Initial script (based on MZ-1Z-013B version).
##
#########################################################################################################
## This source file is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published
## by the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This source file is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
#########################################################################################################

import re
import sys
import os

def collect_defined_labels(lines):
    """Return the set of symbol names defined by the assembly listing.

    A name is recorded when a line begins with ``NAME:`` (whitespace is
    allowed between the name and the colon) or with ``NAME EQU ...``.
    Both forms are tested on every line, so one line may contribute a
    name through each of them.
    """
    # Label definitions (LABEL: instruction)
    label_re = re.compile(r'^([\w\$\.\?][\w\$\.\?]*)\s*:')
    # EQU definitions (LABEL EQU value)
    equ_re = re.compile(r'^(\S+)\s+EQU\s+')

    names = set()
    for text in lines:
        for pattern in (label_re, equ_re):
            hit = pattern.match(text)
            if hit is not None:
                names.add(hit.group(1))
    return names

def find_undefined_refs(lines, defined):
    """Return the set of ``Lxxxx`` names referenced but absent from *defined*.

    Every whole-word ``L`` followed by exactly four hex digits is treated as
    a reference, wherever it occurs on the line. The hex digits are
    upper-cased before the name is looked up in *defined*.
    """
    ref_re = re.compile(r'\bL([0-9A-Fa-f]{4})\b')
    referenced = {
        'L' + hit.group(1).upper()
        for text in lines
        for hit in ref_re.finditer(text)
    }
    return {name for name in referenced if name not in defined}

def replace_undefined_refs(line, undefined):
    """Rewrite each undefined ``Lxxxx`` reference in *line* as ``0xxxxH``.

    References whose upper-cased name is not in *undefined* are left exactly
    as written. The rewritten line is returned.
    """
    def as_numeric(hit):
        digits = hit.group(1)
        if 'L' + digits.upper() not in undefined:
            return hit.group(0)
        return f"0{int(digits, 16):04X}H"

    return re.compile(r'\bL([0-9A-Fa-f]{4})\b').sub(as_numeric, line)

def compact_db_sequences(lines):
    """Fold runs of identical single-byte DB lines into ``DS count, value``.

    A run begins at any single-byte DB line (labeled or not) and extends
    over the unlabeled single-byte DB lines that directly follow it with the
    same value (compared case-insensitively). A labeled DB therefore always
    begins its own run, so its label survives. Runs of two or more lines are
    replaced by one DS line that reuses the first line's prefix (label and/or
    indentation) and trailing comment; shorter runs are copied unchanged.
    """
    # Unlabeled DB line carrying one hex byte
    plain_re = re.compile(r'^(\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')
    # Labeled DB line carrying one hex byte
    label_re = re.compile(r'^([\w\$\.\?][\w\$\.\?]*:\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')

    def split_db(text):
        """Return (lead, byte_text, remark, has_label), or None if not a DB."""
        hit = label_re.match(text)
        has_label = hit is not None
        if not has_label:
            hit = plain_re.match(text)
        if hit is None:
            return None
        lead, byte_text, remark = hit.groups()
        return lead, byte_text.upper(), remark, has_label

    out = []
    total = len(lines)
    pos = 0
    while pos < total:
        head = split_db(lines[pos])
        if head is None:
            out.append(lines[pos])
            pos += 1
            continue

        lead, byte_text, remark, _ = head

        # Extend the run across unlabeled DB lines holding the same value.
        stop = pos + 1
        while stop < total:
            follower = split_db(lines[stop])
            if follower is None or follower[3] or follower[1] != byte_text:
                break
            stop += 1

        run_length = stop - pos
        if run_length < 2:
            out.append(lines[pos])
            pos += 1
            continue

        tail = f" {remark}" if remark else ""
        out.append(f"{lead}DS      {run_length}, {byte_text}{tail}\n")
        pos = stop

    return out

def fix_string_issues(lines):
    """Return a copy of *lines*; broken DB string quoting is detected only.

    An unlabeled DB line that holds more than two double-quote characters
    and whose operand field has an odd number of them is considered to have
    broken quoting. No repair is attempted: such lines are passed through
    unchanged so that the assembler reports them. The detection branch is
    kept as the place where a real repair would go.

    Args:
        lines: Assembly source lines.

    Returns:
        A new list holding the same lines in the same order.
    """
    db_operands = re.compile(r'^(\s+)DB\s+(.*)')
    result = []
    for line in lines:
        if 'DB' in line and line.count('"') > 2:
            m = db_operands.match(line)
            if m and m.group(2).rstrip().count('"') % 2 != 0:
                # Odd number of quotes = broken string. Deliberately kept
                # as-is; the assembler is expected to flag it.
                result.append(line)
                continue
        result.append(line)
    return result

def stringify_db_runs(lines):
    """Convert runs of printable-ASCII DB bytes into DB "text" form.

    A group is one single-byte DB line (labeled or not) plus the unlabeled
    single-byte DB lines that directly follow it; a labeled DB line or any
    other line ends the group. When a group holds at least three printable
    bytes it is replaced by ONE DB line using the first line's prefix and
    comment: every stretch of three or more consecutive printable bytes is
    written as a quoted string and all other bytes as hex. Comments on the
    remaining lines of the group are not carried over. Groups below the
    threshold are copied through unchanged.

    Printable means 0x20-0x7E, excluding the double quote (it would end the
    literal) and the backslash.

    Args:
        lines: Assembly source lines.

    Returns:
        A new list of lines.
    """
    # Pattern for unlabeled single-byte DB
    db_pat = re.compile(r'^(\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')
    # Pattern for labeled single-byte DB
    labeled_db_pat = re.compile(
        r'^([\w\$\.\?][\w\$\.\?]*:\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')

    def parse_db(line):
        """Return (prefix, byte_val, comment, is_labeled) or None."""
        m = labeled_db_pat.match(line)
        is_labeled = m is not None
        if not is_labeled:
            m = db_pat.match(line)
        if m is None:
            return None
        val = int(m.group(2)[1:3], 16)  # strip leading 0 and trailing H
        return (m.group(1), val, m.group(3), is_labeled)

    def is_printable(b):
        # The double quote would terminate the literal.
        # NOTE(review): backslash is kept out of literals on the assumption
        # that GLASS treats it as an escape introducer inside strings -
        # confirm against the assembler documentation. Emitting it as a hex
        # byte is correct either way.
        return 0x20 <= b <= 0x7E and b not in (0x22, 0x5C)

    def flush_group(prefix, values, comment):
        """Convert a list of byte values into a DB line mixing "text" and hex."""
        parts = []
        text_run = []

        def emit_text_run():
            # Runs shorter than three characters stay as hex bytes.
            if len(text_run) >= 3:
                parts.append('"' + ''.join(chr(b) for b in text_run) + '"')
            else:
                parts.extend(f"0{b:02X}H" for b in text_run)
            text_run.clear()

        for b in values:
            if is_printable(b):
                text_run.append(b)
            else:
                emit_text_run()
                parts.append(f"0{b:02X}H")
        emit_text_run()

        line_str = f"{prefix}DB      {','.join(parts)}"
        if comment:
            line_str += f" {comment}"
        return line_str + "\n"

    result = []
    i = 0
    while i < len(lines):
        parsed = parse_db(lines[i])
        if parsed is None:
            result.append(lines[i])
            i += 1
            continue

        prefix, byte_val, comment, _ = parsed

        # Collect a run of DB bytes (stop at labeled lines or non-DB lines)
        values = [byte_val]
        j = i + 1
        while j < len(lines):
            p2 = parse_db(lines[j])
            if p2 is None or p2[3]:
                break
            values.append(p2[1])
            j += 1

        # Three printable bytes imply the group has at least three bytes.
        if sum(1 for b in values if is_printable(b)) >= 3:
            result.append(flush_group(prefix, values, comment))
        else:
            # Any tail of this group has no more printable bytes than the
            # whole, so it cannot qualify either; copy the group in one step
            # instead of re-scanning it from every line.
            result.extend(lines[i:j])
        i = j

    return result

def format_keyword_tables(lines):
    """Reformat BASIC keyword tables so each keyword is on its own line.

    Keywords are bit-7 terminated: the last character of each keyword has
    bit 7 set.  0x80 alone represents an empty/unused slot.

    Output format:
        DB "GOT", "O" | 080H       ; keyword with multiple chars
        DB "A" | 080H              ; single-char keyword
        DB 080H                    ; empty slot

    A table starts at a line beginning with one of the KW_TABLES labels
    followed by a colon. DB and DS lines are consumed from there until the
    table's byte count is reached or a line that is neither is met. Labels
    found on consumed lines are re-attached at the same byte offset; a
    keyword is split at any label that falls inside it. Only the trailing
    comment of the first DB line is kept (on the first output line).

    Bytes that do not end in a bit-7 terminator (a leading fragment of a
    split keyword, or data left over at the end of the table) are written as
    plain hex so their values are not altered. Bytes consumed beyond the
    table size, when a line straddles the end of the table, are written
    after the table instead of being discarded. DB operands naming a symbol
    that is neither a hex literal nor a byte-sized EQU are skipped.

    Args:
        lines: Assembly source lines.

    Returns:
        A new list of lines with the keyword tables rewritten.
    """
    # Keyword table markers: label -> number of data bytes in table
    # MZ-5Z008 keyword tables
    KW_TABLES = {
        'L415B': 411,       # Statement keyword text table (dz80 label)
        'STMTWORDTBL': 411, # Statement keyword text table (equate label)
        'L42F6': 88,        # Extended keyword text table (dz80 label)
        'EXTWORDTBL': 88,   # Extended keyword text table (equate label)
        'L434E': 158,       # Function keyword text table (dz80 label)
        'FUNCWORDTBL': 158, # Function keyword text table (equate label)
    }

    # Build EQU symbol table for resolving symbol names in DB values
    equ_values = {}
    equ_pat = re.compile(r'^(\S+)\s+EQU\s+(0[0-9A-Fa-f]+H)', re.IGNORECASE)
    for line in lines:
        m = equ_pat.match(line)
        if m:
            equ_values[m.group(1)] = int(m.group(2)[:-1], 16)

    # Regex patterns for DB lines (allow zero or more whitespace after label colon)
    db_pat = re.compile(r'^(\s+)DB\s+(.*?)(\s*;.*)?$')
    labeled_db_pat = re.compile(
        r'^([\w\$\.\?][\w\$\.\?]*:\s*)DB\s+(.*?)(\s*;.*)?$')
    ds_pat = re.compile(r'^(\s+)DS\s+(\d+),\s*(0[0-9A-Fa-f]{2}H)(\s*;.*)?$')
    labeled_ds_pat = re.compile(
        r'^([\w\$\.\?][\w\$\.\?]*:\s*)DS\s+(\d+),\s*(0[0-9A-Fa-f]{2}H)(\s*;.*)?$')

    indent = '            '

    def hex_byte(b):
        """Format a byte value as an assembler hex literal (0xxH)."""
        return f"0{b:02X}H"

    def is_text_char(b):
        """True for printable ASCII other than the double quote."""
        return 0x20 <= b <= 0x7E and b != 0x22

    def is_quoted(s, min_len):
        """True when s is a double-quoted literal of at least min_len chars."""
        return len(s) >= min_len and s[0] == '"' and s[-1] == '"'

    def label_prefix(lbl):
        """Return 'LABEL:' padded, where room allows, to the indent width."""
        return f"{lbl}:{indent[len(lbl)+1:]}"

    def parse_hex_val(s):
        """Parse '0xxH' to int; return None for anything else."""
        s = s.strip()
        if s.upper().endswith('H') and s.startswith('0'):
            try:
                return int(s[:-1], 16)
            except ValueError:
                return None
        return None

    def split_outside_quotes(text, sep):
        """Split text on sep, ignoring separators inside double quotes."""
        pieces = []
        current = []
        in_str = False
        for ch in text:
            if ch == '"':
                in_str = not in_str
            if ch == sep and not in_str:
                pieces.append(''.join(current).strip())
                current = []
            else:
                current.append(ch)
        tail = ''.join(current).strip()
        if tail:
            pieces.append(tail)
        return pieces

    def extract_bytes_from_db(data_str):
        """Extract byte values from a DB operand string (hex, strings, ORs)."""
        vals = []
        for part in split_outside_quotes(data_str, ','):
            terms = split_outside_quotes(part, '|')
            if len(terms) > 1:
                # expression like "X" | 080H
                val = 0
                for term in terms:
                    if is_quoted(term, 3):
                        val |= ord(term[1])
                    else:
                        v = parse_hex_val(term)
                        if v is not None:
                            val |= v
                vals.append(val)
                continue

            term = terms[0] if terms else ''
            if is_quoted(term, 2):
                vals.extend(ord(ch) for ch in term[1:-1])
                continue
            v = parse_hex_val(term)
            if v is not None:
                vals.append(v)
            elif term in equ_values and equ_values[term] <= 0xFF:
                vals.append(equ_values[term])
            # else: unknown symbol, skip
        return vals

    def format_keyword(byte_list):
        """Format one keyword's (or keyword fragment's) bytes as a DB line."""
        last_byte = byte_list[-1]
        if not last_byte & 0x80:
            # No bit-7 terminator: OR-ing 080H in would change the data.
            return f"DB      {','.join(hex_byte(b) for b in byte_list)}"

        last_ch = last_byte & 0x7F
        if is_text_char(last_ch):
            last_part = f'"{chr(last_ch)}" | 080H'
        else:
            last_part = hex_byte(last_byte)  # includes the empty slot 080H

        head = byte_list[:-1]
        if not head:
            return f"DB      {last_part}"
        if all(is_text_char(b) for b in head):
            text = ''.join(chr(b) for b in head)
            return f'DB      "{text}", {last_part}'
        # Some non-printable in prefix, fall back to hex
        parts = [hex_byte(b) for b in head] + [last_part]
        return f"DB      {','.join(parts)}"

    result = []
    i = 0
    while i < len(lines):
        # Check if this line starts a keyword table
        table_label = next(
            (lbl for lbl in KW_TABLES if lines[i].startswith(lbl + ':')), None)
        if table_label is None:
            result.append(lines[i])
            i += 1
            continue
        table_size = KW_TABLES[table_label]

        # Found a keyword table. Collect all data bytes and track labels.
        raw_bytes = []
        first_comment = None
        seen_db = False
        # Map: byte offset within table -> label name
        label_at_offset = {0: table_label}
        j = i
        while j < len(lines) and len(raw_bytes) < table_size:
            line = lines[j]
            ml = labeled_db_pat.match(line)
            m_db = ml or db_pat.match(line)
            mlds = None
            m_ds = None
            if m_db is None:
                mlds = labeled_ds_pat.match(line)
                m_ds = mlds or ds_pat.match(line)
            m = m_db or m_ds
            if m is None:
                break

            if ml or mlds:
                # Group 1 is 'LABEL:' plus optional whitespace; taking the
                # name from it also covers 'LABEL:DB' with no space.
                name = m.group(1).rstrip()[:-1]
                if name != table_label:
                    label_at_offset[len(raw_bytes)] = name

            if m_db:
                if not seen_db:
                    seen_db = True
                    # Group 3 is the comment; it is None when absent.
                    if m_db.group(3):
                        first_comment = m_db.group(3).strip()
                raw_bytes.extend(extract_bytes_from_db(m_db.group(2)))
            else:
                val = parse_hex_val(m_ds.group(3))
                if val is not None:
                    raw_bytes.extend([val] * int(m_ds.group(2)))
            j += 1

        # Safety: ensure we advance past the starting line
        if j == i:
            result.append(lines[i])
            i += 1
            continue

        # Parse keywords from raw bytes, tracking byte offsets
        keywords = []  # list of (byte_offset, [bytes])
        kw = []
        kw_start = 0
        for idx, b in enumerate(raw_bytes[:table_size]):
            kw.append(b)
            if b & 0x80:
                keywords.append((kw_start, kw))
                kw = []
                kw_start = idx + 1
        if kw:
            keywords.append((kw_start, kw))

        # Output reformatted keyword table with labels preserved.
        # Labels may fall at keyword boundaries OR mid-keyword.
        for ki, (kw_offset, kw_bytes) in enumerate(keywords):
            comment = f" {first_comment}" if ki == 0 and first_comment else ''

            # Cut the keyword at every labelled offset so each label starts
            # a line; with no inner label this yields the keyword as a whole.
            cuts = sorted(
                {0, len(kw_bytes)}
                | {off for off in range(len(kw_bytes))
                   if kw_offset + off in label_at_offset})
            for si, (seg_start, seg_end) in enumerate(zip(cuts, cuts[1:])):
                lbl = label_at_offset.get(kw_offset + seg_start)
                prefix = label_prefix(lbl) if lbl else indent
                cmt = comment if si == 0 else ''
                chunk = kw_bytes[seg_start:seg_end]
                result.append(f"{prefix}{format_keyword(chunk)}{cmt}\n")

        # A consumed line may have carried bytes past the end of the table;
        # write them back out so no data is lost.
        overflow = raw_bytes[table_size:]
        if overflow:
            if len(overflow) >= 2 and len(set(overflow)) == 1:
                result.append(
                    f"{indent}DS      {len(overflow)}, {hex_byte(overflow[0])}\n")
            else:
                joined = ','.join(hex_byte(b) for b in overflow)
                result.append(f"{indent}DB      {joined}\n")

        i = j

    return result

def main():
    """Command-line entry point: post-process one dz80 listing.

    Usage: ``<script> <input.asm> [output.asm]``. With no output path the
    input file is overwritten. With no arguments a usage message is printed
    and the process exits with status 1.

    Undefined Lxxxx references are replaced first, then the remaining passes
    run in a fixed order; progress figures are printed to stdout.
    """
    args = sys.argv
    if len(args) < 2:
        print(f"Usage: {args[0]} <input.asm> [output.asm]")
        print("  Post-processes dz80 output for GLASS Z80 assembler compatibility.")
        sys.exit(1)

    input_file = args[1]
    output_file = input_file if len(args) <= 2 else args[2]

    with open(input_file, 'r') as src:
        lines = src.readlines()
    original_count = len(lines)

    # Steps 1-2: find Lxxxx references with no definition and make them numeric.
    defined = collect_defined_labels(lines)
    undefined = find_undefined_refs(lines, defined)
    print(f"  Defined labels: {len(defined)}")
    print(f"  Undefined Lxxxx references: {len(undefined)}")
    lines = [replace_undefined_refs(line, undefined) for line in lines]

    # Steps 3-6, in order. Keyword tables must be reformatted BEFORE the
    # stringify pass so labels inside the tables are preserved.
    # NOTE(review): compaction runs ahead of stringify, so two or more equal
    # adjacent printable bytes become a DS line, which the stringify pass
    # does not treat as part of a DB group - confirm this is intended.
    passes = (
        fix_string_issues,       # Step 3: string quoting
        compact_db_sequences,    # Step 4: same-value DB runs -> DS
        format_keyword_tables,   # Step 5: one keyword per line
        stringify_db_runs,       # Step 6: printable DB runs -> DB "text"
    )
    for run_pass in passes:
        lines = run_pass(lines)

    print(f"  Lines: {original_count} -> {len(lines)} (compacted {original_count - len(lines)} lines)")

    with open(output_file, 'w') as dst:
        dst.writelines(lines)

    print(f"  Output: {output_file}")

# Run the post-processor only when this file is executed as a script.
if __name__ == '__main__':
    main()