Files
RFS/dis/MZ-5Z008/postprocess_dz80.py

565 lines
21 KiB
Python
Vendored

#!/usr/bin/env python3
#########################################################################################################
##
## Name: postprocess_dz80.py
## Created: March 2026
## Author(s): Philip Smart
## Description: Post-processes dz80 disassembly output for GLASS Z80 assembler compatibility.
## Adapted for MZ-5Z008 Disk BASIC.
##
## Fixes:
## 1. Undefined Lxxxx label references -> numeric hex values
## 2. Compacts consecutive same-value DB lines into DS count, value
## 3. Fixes any string quoting issues
## 4. Reformats BASIC keyword tables (one keyword per line)
## 5. Converts runs of printable-ASCII DB bytes into DB "text" form
##
## Credits:
## Copyright: (c) 2026 Philip Smart <philip.smart@net2net.org>
##
## History: March 2026 - Initial script (based on MZ-1Z-013B version).
##
#########################################################################################################
## This source file is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published
## by the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This source file is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
#########################################################################################################
import re
import sys
import os
def collect_defined_labels(lines):
    """Return the set of symbol names defined by the assembly text.

    A symbol counts as defined when a line begins with ``NAME:`` (optional
    whitespace before the colon) or with ``NAME EQU ...``.  Names are stored
    exactly as written; no case folding is applied.
    """
    definition_patterns = (
        # LABEL: instruction
        re.compile(r'^([\w\$\.\?][\w\$\.\?]*)\s*:'),
        # LABEL EQU value
        re.compile(r'^(\S+)\s+EQU\s+'),
    )
    symbols = set()
    for text in lines:
        for pattern in definition_patterns:
            hit = pattern.match(text)
            if hit is not None:
                symbols.add(hit.group(1))
    return symbols
def find_undefined_refs(lines, defined):
    """Return every ``Lxxxx`` name used in *lines* that is absent from *defined*.

    ``xxxx`` is exactly four hex digits.  The digits are upper-cased before
    the membership test, so the returned names are always upper case.  The
    whole line is scanned, comments included.
    """
    reference = re.compile(r'\bL([0-9A-Fa-f]{4})\b')
    seen = {
        'L' + hit.group(1).upper()
        for text in lines
        for hit in reference.finditer(text)
    }
    return {name for name in seen if name not in defined}
def replace_undefined_refs(line, undefined):
    """Rewrite each undefined ``Lxxxx`` reference in *line* as ``0xxxxH``.

    References whose upper-cased name is not in *undefined* are left exactly
    as they appear.  Returns the rewritten line.
    """
    def to_numeric(hit):
        digits = hit.group(1)
        if 'L' + digits.upper() not in undefined:
            return hit.group(0)
        return f"0{int(digits, 16):04X}H"

    return re.sub(r'\bL([0-9A-Fa-f]{4})\b', to_numeric, line)
def compact_db_sequences(lines):
    """Collapse runs of identical single-byte DB lines into ``DS count, value``.

    A run is one DB line (labeled or not) followed by any number of
    *unlabeled* DB lines carrying the same byte value.  Runs of two or more
    lines become a single DS line that keeps the first line's label or
    indentation and the first line's comment; comments on the folded
    continuation lines are not carried over.  A labeled DB never joins a
    preceding run, so every label survives.  All other lines pass through
    unchanged.
    """
    unlabeled_re = re.compile(r'^(\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')
    labeled_re = re.compile(r'^([\w\$\.\?][\w\$\.\?]*:\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')

    def classify(text):
        """Return (lead, value, comment, has_label), or None for other lines."""
        for pattern, has_label in ((labeled_re, True), (unlabeled_re, False)):
            hit = pattern.match(text)
            if hit:
                return (hit.group(1), hit.group(2).upper(), hit.group(3), has_label)
        return None

    compacted = []
    total = len(lines)
    pos = 0
    while pos < total:
        head = classify(lines[pos])
        if head is None:
            compacted.append(lines[pos])
            pos += 1
            continue

        lead, value, remark, _ = head

        # Extend the run over following unlabeled DB lines with the same value.
        stop = pos + 1
        while stop < total:
            follower = classify(lines[stop])
            if follower is None or follower[3] or follower[1] != value:
                break
            stop += 1

        run_length = stop - pos
        if run_length < 2:
            # A lone DB stays exactly as it was written.
            compacted.append(lines[pos])
            pos += 1
            continue

        tail = f" {remark}" if remark else ""
        compacted.append(f"{lead}DS {run_length}, {value}{tail}\n")
        pos = stop
    return compacted
def fix_string_issues(lines):
    """Return a new list holding *lines* unchanged.

    This pipeline stage is currently a pass-through.  The previous body
    counted the double quotes on unlabeled ``DB`` lines to spot an odd
    (unbalanced) count, but it appended the original line in every branch,
    so the check never influenced the output.  Lines with unbalanced quotes
    are therefore left for the assembler to report.

    The stage is kept, rather than removed, so callers and the processing
    order stay the same and a real repair can be added here later.
    """
    return list(lines)
def stringify_db_runs(lines):
    """Convert groups of single-byte DB lines into one DB line with "text".

    A group is one single-byte DB line (labeled or not) followed by every
    directly following *unlabeled* single-byte DB line; a labeled DB line or
    any other line ends the group.  When a group holds at least three text
    bytes (not necessarily adjacent) the whole group is merged into a single
    ``DB`` line: each stretch of three or more adjacent text bytes becomes a
    quoted string and every other byte stays in ``0xxH`` form.  The merged
    line keeps the first line's label/indentation and comment; comments on
    the other lines of the group are dropped.  Groups that do not qualify
    are emitted unchanged.

    Text bytes are printable ASCII (0x20-0x7E) except the double quote,
    which would end the string literal, and the backslash, which assemblers
    with string escape sequences would reinterpret (GLASS documents such
    escapes).  Emitting those two as hex is always byte-exact.
    """
    # Unlabeled single-byte DB
    db_pat = re.compile(r'^(\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')
    # Labeled single-byte DB
    labeled_db_pat = re.compile(
        r'^([\w\$\.\?][\w\$\.\?]*:\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')

    def parse_db(line):
        """Return (prefix, byte_val, comment, is_labeled) or None."""
        for pattern, is_labeled in ((labeled_db_pat, True), (db_pat, False)):
            m = pattern.match(line)
            if m:
                # Operand is "0xxH": the two digits after the leading 0.
                return (m.group(1), int(m.group(2)[1:3], 16), m.group(3), is_labeled)
        return None

    def is_printable(b):
        return 0x20 <= b <= 0x7E and b not in (0x22, 0x5C)

    def flush_text(parts, text_run):
        """Append a pending character run as a string (3+ chars) or as hex."""
        if len(text_run) >= 3:
            parts.append('"' + ''.join(text_run) + '"')
        else:
            parts.extend(f"0{ord(ch):02X}H" for ch in text_run)

    def flush_group(prefix, values, comment):
        """Build one DB line mixing "text" and hex operands from *values*."""
        parts = []
        text_run = []
        for b in values:
            if is_printable(b):
                text_run.append(chr(b))
                continue
            flush_text(parts, text_run)
            text_run = []
            parts.append(f"0{b:02X}H")
        flush_text(parts, text_run)
        line_str = f"{prefix}DB {','.join(parts)}"
        if comment:
            line_str += f" {comment}"
        return line_str + "\n"

    result = []
    i = 0
    while i < len(lines):
        parsed = parse_db(lines[i])
        if parsed is None:
            result.append(lines[i])
            i += 1
            continue

        prefix, byte_val, comment, _ = parsed

        # Gather the group: stop at a labeled DB or at any non-DB line.
        values = [byte_val]
        j = i + 1
        while j < len(lines):
            following = parse_db(lines[j])
            if following is None or following[3]:
                break
            values.append(following[1])
            j += 1

        printable_count = sum(1 for b in values if is_printable(b))
        if printable_count >= 3:
            result.append(flush_group(prefix, values, comment))
        else:
            # No suffix of a non-qualifying group can qualify either, so the
            # whole group is emitted verbatim in one step instead of being
            # re-scanned line by line.
            result.extend(lines[i:j])
        i = j
    return result
def format_keyword_tables(lines):
    """Reformat BASIC keyword tables so each keyword is on its own line.

    Keywords are bit-7 terminated: the last character of each keyword has
    bit 7 set.  0x80 alone represents an empty/unused slot.

    Output format:
        DB "GOT", "O" | 080H ; keyword with multiple chars
        DB "A" | 080H ; single-char keyword
        DB 080H ; empty slot

    A table starts on a line beginning with one of the ``KW_TABLES`` labels
    and covers that many data bytes, gathered from the consecutive DB / DS
    lines that follow.  Gathering stops early at the first line that is not
    a DB/DS line or whose operands cannot be decoded; that line and those
    after it are left untouched.  Labels found inside a table are re-emitted
    at the same byte offset, splitting a keyword when a label falls in its
    middle.  Only the first comment found on a DB line of the table is kept
    (on the first output line).

    The rewrite is byte-preserving:
      * the ``| 080H`` form is used only for bytes that really have bit 7
        set, so an unterminated trailing fragment keeps its values;
      * bytes decoded beyond the table size (a DB/DS line straddling the
        end of the table) are emitted afterwards as a plain hex DB line;
      * double quote and backslash are never placed inside a quoted string.

    Returns a new list of lines.
    """
    # Keyword table markers: label -> number of data bytes in table
    # MZ-5Z008 keyword tables
    KW_TABLES = {
        'L415B': 411,  # Statement keyword text table (dz80 label)
        'STMTWORDTBL': 411,  # Statement keyword text table (equate label)
        'L42F6': 88,  # Extended keyword text table (dz80 label)
        'EXTWORDTBL': 88,  # Extended keyword text table (equate label)
        'L434E': 158,  # Function keyword text table (dz80 label)
        'FUNCWORDTBL': 158,  # Function keyword text table (equate label)
    }
    indent = ' '

    # Build EQU symbol table for resolving symbol names in DB values
    equ_values = {}
    equ_pat = re.compile(r'^(\S+)\s+EQU\s+(0[0-9A-Fa-f]+H)', re.IGNORECASE)
    for line in lines:
        m = equ_pat.match(line)
        if m:
            equ_values[m.group(1)] = int(m.group(2)[:-1], 16)

    # Regex patterns for DB/DS lines (zero or more whitespace after label colon)
    db_pat = re.compile(r'^(\s+)DB\s+(.*?)(\s*;.*)?$')
    labeled_db_pat = re.compile(
        r'^([\w\$\.\?][\w\$\.\?]*:\s*)DB\s+(.*?)(\s*;.*)?$')
    ds_pat = re.compile(r'^(\s+)DS\s+(\d+),\s*(0[0-9A-Fa-f]{2}H)(\s*;.*)?$')
    labeled_ds_pat = re.compile(
        r'^([\w\$\.\?][\w\$\.\?]*:\s*)DS\s+(\d+),\s*(0[0-9A-Fa-f]{2}H)(\s*;.*)?$')

    def hex_byte(b):
        """Format a byte value as an ``0xxH`` operand."""
        return f"0{b:02X}H"

    def is_text_byte(b):
        """True for printable ASCII that is safe inside a quoted string."""
        return 0x20 <= b <= 0x7E and b not in (0x22, 0x5C)

    def label_prefix(name):
        """Return ``name:`` padded towards the width of the plain indent."""
        return f"{name}:{indent[len(name)+1:]}"

    def parse_hex_val(s):
        """Parse '0xxH' to int; return None when *s* is not such a literal."""
        s = s.strip()
        if s.upper().endswith('H') and s.startswith('0'):
            try:
                return int(s[:-1], 16)
            except ValueError:
                return None
        return None

    def split_operands(data_str):
        """Split a DB operand list on commas that are outside quoted strings."""
        parts = []
        current = ''
        in_str = False
        for ch in data_str:
            if ch == '"':
                in_str = not in_str
                current += ch
            elif ch == ',' and not in_str:
                parts.append(current.strip())
                current = ''
            else:
                current += ch
        if current.strip():
            parts.append(current.strip())
        return parts

    def extract_bytes_from_db(data_str):
        """Decode a DB operand list into byte values.

        Handles hex literals, quoted strings, ``"X" | 080H`` expressions and
        EQU symbols with a value up to 0xFF.  Returns None when any operand
        cannot be decoded, so the caller can leave the line alone instead of
        losing a byte.
        """
        vals = []
        for part in split_operands(data_str):
            if len(part) >= 2 and part.startswith('"') and part.endswith('"'):
                vals.extend(ord(ch) for ch in part[1:-1])
            elif '|' in part:
                # expression like "X" | 080H
                val = 0
                for s in (piece.strip() for piece in part.split('|')):
                    if len(s) >= 3 and s.startswith('"') and s.endswith('"'):
                        val |= ord(s[1])
                    else:
                        v = parse_hex_val(s)
                        if v is None:
                            return None
                        val |= v
                vals.append(val)
            else:
                v = parse_hex_val(part)
                if v is None:
                    v = equ_values.get(part)
                    if v is None or v > 0xFF:
                        return None
                vals.append(v)
        return vals

    def format_keyword(byte_list):
        """Format one keyword's bytes as a DB statement (no indent/newline)."""
        body = byte_list[:-1]
        last_byte = byte_list[-1]
        if not body and last_byte == 0x80:
            return "DB 080H"
        last_ch = last_byte & 0x7F
        # Only claim "| 080H" when bit 7 is really set in the source byte.
        if last_byte & 0x80 and is_text_byte(last_ch):
            last_part = f'"{chr(last_ch)}" | 080H'
        else:
            last_part = hex_byte(last_byte)
        if not body:
            return f"DB {last_part}"
        if all(is_text_byte(b) for b in body):
            text = ''.join(chr(b) for b in body)
            return f'DB "{text}", {last_part}'
        # Some non-text byte in the leading part: fall back to hex for it all.
        parts = [hex_byte(b) for b in body]
        parts.append(last_part)
        return f"DB {','.join(parts)}"

    result = []
    i = 0
    while i < len(lines):
        # Check if this line starts a keyword table
        table_label = None
        table_size = 0
        for label, size in KW_TABLES.items():
            if lines[i].startswith(label + ':'):
                table_label = label
                table_size = size
                break
        if table_label is None:
            result.append(lines[i])
            i += 1
            continue

        # Found a keyword table.  Collect all data bytes and track labels.
        raw_bytes = []
        first_comment = None
        # Map: byte offset within table -> label name
        label_at_offset = {0: table_label}
        j = i
        while j < len(lines) and len(raw_bytes) < table_size:
            line = lines[j]
            m = labeled_db_pat.match(line) or db_pat.match(line)
            if m:
                new_bytes = extract_bytes_from_db(m.group(2))
                if new_bytes is None:
                    break
                # Group 3 is the comment; it is None when the line has none.
                comment_text = m.group(3)
            else:
                m = labeled_ds_pat.match(line) or ds_pat.match(line)
                if not m:
                    break
                new_bytes = [parse_hex_val(m.group(3))] * int(m.group(2))
                comment_text = None

            # Group 1 is either pure indentation or "LABEL:" plus padding.
            lead = m.group(1)
            if ':' in lead:
                name = lead.split(':', 1)[0]
                if name != table_label:
                    label_at_offset[len(raw_bytes)] = name
            if first_comment is None and comment_text:
                first_comment = comment_text.strip()
            raw_bytes.extend(new_bytes)
            j += 1

        # Safety: ensure we advance past the starting line
        if j == i:
            result.append(lines[i])
            i += 1
            continue
        if not raw_bytes:
            # Lines were consumed but produced no data: keep them verbatim.
            result.extend(lines[i:j])
            i = j
            continue

        # Parse keywords from raw bytes, tracking byte offsets
        table_bytes = raw_bytes[:table_size]
        keywords = []  # list of (byte_offset, [bytes])
        kw_start = 0
        for idx, b in enumerate(table_bytes):
            if b & 0x80:
                keywords.append((kw_start, table_bytes[kw_start:idx + 1]))
                kw_start = idx + 1
        if kw_start < len(table_bytes):
            # Trailing bytes without a bit-7 terminator
            keywords.append((kw_start, table_bytes[kw_start:]))

        # Output reformatted keyword table with labels preserved.
        # Labels may fall at keyword boundaries OR mid-keyword.
        for ki, (kw_offset, kw_bytes) in enumerate(keywords):
            comment = f" {first_comment}" if ki == 0 and first_comment else ''
            # Labels within this keyword's byte range: relative offset -> name
            kw_labels = {
                off: label_at_offset[kw_offset + off]
                for off in range(len(kw_bytes))
                if kw_offset + off in label_at_offset
            }
            if not kw_labels:
                result.append(f"{indent}{format_keyword(kw_bytes)}{comment}\n")
            elif list(kw_labels) == [0]:
                # Label at keyword start only
                result.append(
                    f"{label_prefix(kw_labels[0])}{format_keyword(kw_bytes)}{comment}\n")
            else:
                # Labels mid-keyword: split output at each label position.
                split_pts = sorted({0, len(kw_bytes), *kw_labels})
                for si, (seg_start, seg_end) in enumerate(
                        zip(split_pts, split_pts[1:])):
                    chunk = kw_bytes[seg_start:seg_end]
                    lbl = kw_labels.get(seg_start)
                    prefix = label_prefix(lbl) if lbl else indent
                    cmt = comment if si == 0 else ''
                    if seg_end == len(kw_bytes) and chunk[-1] & 0x80:
                        statement = format_keyword(chunk)
                    else:
                        # Non-final fragments are written as plain hex.
                        statement = f"DB {','.join(hex_byte(b) for b in chunk)}"
                    result.append(f"{prefix}{statement}{cmt}\n")

        # Bytes decoded past the table end belong to whatever follows the
        # table; write them back so no data is lost.
        overflow = raw_bytes[table_size:]
        if overflow:
            result.append(
                f"{indent}DB {','.join(hex_byte(b) for b in overflow)}\n")
        i = j
    return result
def main():
    """Command-line entry point: post-process one dz80 listing.

    Usage: ``postprocess_dz80.py <input.asm> [output.asm]``.  Without an
    output path the input file is overwritten in place.  With no arguments
    the usage text is printed and the process exits with status 1.

    Processing order: resolve undefined Lxxxx references, string-quoting
    pass, DB -> DS compaction, keyword-table reformatting, then
    DB -> "text" conversion.  The keyword tables are handled before the
    text conversion so labels inside the tables are preserved.
    """
    argv = sys.argv
    if len(argv) < 2:
        print(f"Usage: {argv[0]} <input.asm> [output.asm]")
        print(" Post-processes dz80 output for GLASS Z80 assembler compatibility.")
        sys.exit(1)

    source_path = argv[1]
    target_path = argv[2] if len(argv) > 2 else source_path

    with open(source_path, 'r') as src:
        text_lines = src.readlines()
    lines_in = len(text_lines)

    # Work out which Lxxxx references have no definition in the listing.
    known = collect_defined_labels(text_lines)
    missing = find_undefined_refs(text_lines, known)
    print(f" Defined labels: {len(known)}")
    print(f" Undefined Lxxxx references: {len(missing)}")

    # Replace those references with their numeric value.
    text_lines = [replace_undefined_refs(entry, missing) for entry in text_lines]

    # Remaining whole-file passes, applied in this fixed order.
    for stage in (fix_string_issues,
                  compact_db_sequences,
                  format_keyword_tables,
                  stringify_db_runs):
        text_lines = stage(text_lines)

    lines_out = len(text_lines)
    print(f" Lines: {lines_in} -> {lines_out} (compacted {lines_in - lines_out} lines)")

    with open(target_path, 'w') as dst:
        dst.writelines(text_lines)
    print(f" Output: {target_path}")
# Run the post-processor only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()