565 lines
21 KiB
Python
Vendored
565 lines
21 KiB
Python
Vendored
#!/usr/bin/env python3
|
|
#########################################################################################################
|
|
##
|
|
## Name: postprocess_dz80.py
|
|
## Created: March 2026
|
|
## Author(s): Philip Smart
|
|
## Description: Post-processes dz80 disassembly output for GLASS Z80 assembler compatibility.
|
|
## Adapted for MZ-5Z008 Disk BASIC.
|
|
##
|
|
## Fixes:
|
|
## 1. Undefined Lxxxx label references -> numeric hex values
|
|
## 2. Compacts consecutive same-value DB lines into DS count, value
|
|
## 3. Fixes any string quoting issues
|
|
## 4. Reformats BASIC keyword tables (one keyword per line)
|
|
## 5. Converts runs of printable-ASCII DB bytes into DB "text" form
|
|
##
|
|
## Credits:
|
|
## Copyright: (c) 2026 Philip Smart <philip.smart@net2net.org>
|
|
##
|
|
## History: March 2026 - Initial script (based on MZ-1Z-013B version).
|
|
##
|
|
#########################################################################################################
|
|
## This source file is free software: you can redistribute it and/or modify
|
|
## it under the terms of the GNU General Public License as published
|
|
## by the Free Software Foundation, either version 3 of the License, or
|
|
## (at your option) any later version.
|
|
##
|
|
## This source file is distributed in the hope that it will be useful,
|
|
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
## GNU General Public License for more details.
|
|
##
|
|
## You should have received a copy of the GNU General Public License
|
|
## along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
#########################################################################################################
|
|
|
|
import re
|
|
import sys
|
|
import os
|
|
|
|
def collect_defined_labels(lines):
    """Return the set of symbol names defined by the assembly listing.

    A name counts as defined when a line begins with ``NAME:`` (whitespace is
    allowed before the colon) or with ``NAME EQU ...``.  Both forms are tested
    on every line, so a line may contribute through either pattern.
    """
    definition_patterns = (
        # Label definitions (LABEL: instruction)
        re.compile(r'^([\w\$\.\?][\w\$\.\?]*)\s*:'),
        # EQU definitions (LABEL EQU value)
        re.compile(r'^(\S+)\s+EQU\s+'),
    )
    symbols = set()
    for text in lines:
        for pattern in definition_patterns:
            hit = pattern.match(text)
            if hit:
                symbols.add(hit.group(1))
    return symbols
|
|
|
|
def find_undefined_refs(lines, defined):
    """Return the set of ``Lxxxx`` names that are referenced but not defined.

    Every whole-word occurrence of ``L`` followed by exactly four hex digits
    is treated as a reference.  The hex digits are upper-cased before the
    name is looked up in ``defined``.
    """
    ref_pattern = re.compile(r'\bL([0-9A-Fa-f]{4})\b')
    referenced = {
        'L' + hit.group(1).upper()
        for text in lines
        for hit in ref_pattern.finditer(text)
    }
    return {name for name in referenced if name not in defined}
|
|
|
|
def replace_undefined_refs(line, undefined):
    """Rewrite undefined ``Lxxxx`` references in ``line`` as hex literals.

    A reference whose upper-cased name is in ``undefined`` becomes
    ``0xxxxH`` (leading zero, four upper-case hex digits, ``H`` suffix).
    All other text is returned untouched.
    """
    def to_hex_literal(hit):
        digits = hit.group(1)
        if 'L' + digits.upper() not in undefined:
            return hit.group(0)
        return f"0{int(digits, 16):04X}H"

    return re.sub(r'\bL([0-9A-Fa-f]{4})\b', to_hex_literal, line)
|
|
|
|
def compact_db_sequences(lines):
|
|
"""Compact consecutive DB lines with the same value into DS count, value.
|
|
|
|
Handles both labeled and unlabeled DB lines. A labeled DB starts a new
|
|
DS group (preserving the label); subsequent unlabeled DBs of the same
|
|
value are folded into that group. Consecutive labeled DBs with the same
|
|
value each get their own DS line (since labels must be preserved).
|
|
Minimum run length for compaction is 2.
|
|
"""
|
|
result = []
|
|
i = 0
|
|
# Regex to match unlabeled DB line with a single hex byte value
|
|
db_pat = re.compile(r'^(\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')
|
|
# Regex to match labeled DB line
|
|
labeled_db_pat = re.compile(r'^([\w\$\.\?][\w\$\.\?]*:\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')
|
|
|
|
def parse_db(line):
|
|
"""Return (prefix, value, comment, is_labeled) or None."""
|
|
m = labeled_db_pat.match(line)
|
|
if m:
|
|
return (m.group(1), m.group(2).upper(), m.group(3), True)
|
|
m = db_pat.match(line)
|
|
if m:
|
|
return (m.group(1), m.group(2).upper(), m.group(3), False)
|
|
return None
|
|
|
|
while i < len(lines):
|
|
parsed = parse_db(lines[i])
|
|
if parsed is None:
|
|
result.append(lines[i])
|
|
i += 1
|
|
continue
|
|
|
|
prefix, value, comment, is_labeled = parsed
|
|
|
|
# Scan ahead: count consecutive unlabeled DB lines with the same value
|
|
j = i + 1
|
|
while j < len(lines):
|
|
p2 = parse_db(lines[j])
|
|
if p2 and p2[1] == value and not p2[3]:
|
|
j += 1
|
|
else:
|
|
break
|
|
|
|
count = j - i # total lines in this run (first + unlabeled continuations)
|
|
|
|
if count >= 2:
|
|
# Compact to DS count, value
|
|
if comment:
|
|
result.append(f"{prefix}DS {count}, {value} {comment}\n")
|
|
else:
|
|
result.append(f"{prefix}DS {count}, {value}\n")
|
|
i = j
|
|
else:
|
|
result.append(lines[i])
|
|
i += 1
|
|
|
|
return result
|
|
|
|
def fix_string_issues(lines):
    """Return a copy of ``lines``; no string-quoting repair is performed.

    This pipeline stage is a placeholder.  Its earlier body counted the
    double quotes on DB lines, but every path appended the line unchanged, so
    the count never influenced the output.  DB lines with unbalanced quotes
    are therefore deliberately left as-is and are expected to be reported by
    the assembler.

    Args:
        lines: Iterable of source lines.

    Returns:
        A new list holding the same lines in the same order.
    """
    return list(lines)
|
|
|
|
def stringify_db_runs(lines):
    """Merge runs of single-byte DB lines into one DB line using "text" form.

    A group starts at any single-byte DB line (labeled or unlabeled) and
    extends over the following unlabeled single-byte DB lines; a labeled line
    or any other line ends the group.  A group holding at least three
    printable bytes is replaced by a single DB line that keeps the first
    line's prefix (label or indentation) and comment.  Within that line,
    stretches of three or more consecutive printable bytes are written as a
    quoted string and everything else as ``0xxH`` hex operands.  Groups that
    do not qualify are copied through unchanged.

    Double quote (0x22) and backslash (0x5C) are never placed inside a
    string: the first would terminate the literal, and the second is
    understood to introduce an escape sequence in Glass string literals, so
    both are emitted as hex.

    Args:
        lines: List of source lines.

    Returns:
        A new list of lines with qualifying groups merged.
    """
    # Pattern for unlabeled single-byte DB
    db_pat = re.compile(r'^(\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')
    # Pattern for labeled single-byte DB
    labeled_db_pat = re.compile(
        r'^([\w\$\.\?][\w\$\.\?]*:\s+)DB\s+(0[0-9A-Fa-f]{2}H)\s*(;.*)?$')
    min_text_run = 3

    def parse_db(line):
        """Return (prefix, byte_val, comment, is_labeled) or None."""
        for pattern, is_labeled in ((labeled_db_pat, True), (db_pat, False)):
            m = pattern.match(line)
            if m:
                # Operand is 0xxH: drop the leading 0 and the trailing H.
                return m.group(1), int(m.group(2)[1:3], 16), m.group(3), is_labeled
        return None

    def is_printable(b):
        # Printable ASCII, minus the two characters that are unsafe inside a
        # quoted literal: '"' (0x22) and '\' (0x5C).
        return 0x20 <= b <= 0x7E and b not in (0x22, 0x5C)

    def flush_group(prefix, values, comment):
        """Render byte values as one DB line mixing "text" and hex operands."""
        parts = []
        text_run = []

        def flush_text():
            # Short runs are not worth quoting; emit them as hex instead.
            if len(text_run) >= min_text_run:
                parts.append('"' + ''.join(text_run) + '"')
            else:
                parts.extend(f"0{ord(ch):02X}H" for ch in text_run)
            text_run.clear()

        for b in values:
            if is_printable(b):
                text_run.append(chr(b))
            else:
                flush_text()
                parts.append(f"0{b:02X}H")
        flush_text()

        line_str = f"{prefix}DB {','.join(parts)}"
        if comment:
            line_str += f" {comment}"
        return line_str + "\n"

    # Parse every line exactly once; the grouping loop only inspects results.
    parsed_lines = [parse_db(line) for line in lines]
    total = len(lines)
    result = []
    i = 0
    while i < total:
        parsed = parsed_lines[i]
        if parsed is None:
            result.append(lines[i])
            i += 1
            continue

        prefix, byte_val, comment, _ = parsed

        # Collect the group: stop at a labeled line or a non-DB line.
        values = [byte_val]
        j = i + 1
        while j < total and parsed_lines[j] is not None and not parsed_lines[j][3]:
            values.append(parsed_lines[j][1])
            j += 1

        printable_count = sum(1 for b in values if is_printable(b))
        if printable_count >= 3 and len(values) >= 3:
            result.append(flush_group(prefix, values, comment))
        else:
            # Every suffix of a rejected group has no more printable bytes
            # than the group itself, so it would be rejected too.  Copy the
            # whole group at once instead of rescanning it line by line.
            result.extend(lines[i:j])
        i = j

    return result
|
|
|
|
def format_keyword_tables(lines, kw_tables=None):
    """Reformat BASIC keyword tables so each keyword is on its own line.

    Keywords are bit-7 terminated: the last character of each keyword has
    bit 7 set.  0x80 alone represents an empty/unused slot.

    Output format:
        DB "GOT", "O" | 080H ; keyword with multiple chars
        DB "A" | 080H ; single-char keyword
        DB 080H ; empty slot

    A table starts at a line beginning with ``<label>:`` for a label in
    ``kw_tables``.  DB and DS lines are consumed from there until the table's
    byte count is reached or a line that is neither DB nor DS is met.  Labels
    found inside the table are re-attached at the same byte offset; a label
    that falls inside a keyword splits that keyword's output at the label.

    Byte values are always preserved:
      * ``| 080H`` is only written for a final byte that really has bit 7
        set; an unterminated trailing keyword is written with its true value.
      * Bytes read past the table size (a DS or multi-value DB line that
        straddles the end of the table) are emitted after the table.
      * Double quote and backslash are written as hex, never inside a quoted
        literal (backslash is understood to start an escape in Glass).

    NOTE: a DB operand that is neither hex, a string, nor a known EQU symbol
    with a value <= 0xFF is skipped, as before; such a table will come out
    shorter than its source.

    Args:
        lines: List of source lines.
        kw_tables: Optional mapping of table label -> number of data bytes.
            Defaults to the MZ-5Z008 Disk BASIC tables.

    Returns:
        A new list of lines with the keyword tables reformatted.
    """
    if kw_tables is None:
        # MZ-5Z008 keyword tables: label -> number of data bytes in table
        kw_tables = {
            'L415B': 411,        # Statement keyword text table (dz80 label)
            'STMTWORDTBL': 411,  # Statement keyword text table (equate label)
            'L42F6': 88,         # Extended keyword text table (dz80 label)
            'EXTWORDTBL': 88,    # Extended keyword text table (equate label)
            'L434E': 158,        # Function keyword text table (dz80 label)
            'FUNCWORDTBL': 158,  # Function keyword text table (equate label)
        }

    # Build EQU symbol table for resolving symbol names in DB values
    equ_values = {}
    equ_pat = re.compile(r'^(\S+)\s+EQU\s+(0[0-9A-Fa-f]+H)', re.IGNORECASE)
    for line in lines:
        m = equ_pat.match(line)
        if m:
            equ_values[m.group(1)] = int(m.group(2)[:-1], 16)

    # Regex patterns for DB/DS lines (allow zero or more whitespace after label colon)
    db_pat = re.compile(r'^(\s+)DB\s+(.*?)(\s*;.*)?$')
    labeled_db_pat = re.compile(
        r'^([\w\$\.\?][\w\$\.\?]*:\s*)DB\s+(.*?)(\s*;.*)?$')
    ds_pat = re.compile(r'^(\s+)DS\s+(\d+),\s*(0[0-9A-Fa-f]{2}H)(\s*;.*)?$')
    labeled_ds_pat = re.compile(
        r'^([\w\$\.\?][\w\$\.\?]*:\s*)DS\s+(\d+),\s*(0[0-9A-Fa-f]{2}H)(\s*;.*)?$')

    indent = ' '

    def is_text(ch):
        """True when ``ch`` may be written inside a quoted literal."""
        return 0x20 <= ch <= 0x7E and ch not in (0x22, 0x5C)

    def hex_operand(b):
        """Render a byte as a 0xxH operand."""
        return f"0{b:02X}H"

    def parse_hex_val(s):
        """Parse '0xxH' to int; return None for anything else."""
        s = s.strip()
        if s.upper().endswith('H') and s.startswith('0'):
            try:
                return int(s[:-1], 16)
            except ValueError:
                return None
        return None

    def split_operands(data_str):
        """Split a DB operand string on commas that are outside quotes."""
        parts = []
        current = ''
        in_str = False
        for ch in data_str:
            if ch == '"':
                in_str = not in_str
                current += ch
            elif ch == ',' and not in_str:
                parts.append(current.strip())
                current = ''
            else:
                current += ch
        if current.strip():
            parts.append(current.strip())
        return parts

    def extract_bytes_from_db(data_str):
        """Extract byte values from a DB operand string (hex, strings, ORs)."""
        vals = []
        for part in split_operands(data_str):
            if part.startswith('"') and part.endswith('"'):
                vals.extend(ord(ch) for ch in part[1:-1])
            elif '|' in part:
                # expression like "X" | 080H
                val = 0
                for term in (t.strip() for t in part.split('|')):
                    if len(term) >= 3 and term.startswith('"') and term.endswith('"'):
                        val |= ord(term[1])
                    else:
                        v = parse_hex_val(term)
                        if v is not None:
                            val |= v
                vals.append(val)
            else:
                v = parse_hex_val(part)
                if v is not None:
                    vals.append(v)
                elif part in equ_values and equ_values[part] <= 0xFF:
                    vals.append(equ_values[part])
                # else: unknown symbol, skip
        return vals

    def format_last(b):
        """Operand for the final byte of a keyword.

        The ``"c" | 080H`` form is only valid when bit 7 is actually set;
        otherwise the byte is written as plain hex so its value is unchanged.
        """
        ch = b & 0x7F
        if b & 0x80 and is_text(ch):
            return f'"{chr(ch)}" | 080H'
        return hex_operand(b)

    def format_keyword(byte_list):
        """Format one keyword's bytes as a DB statement (no indent/newline)."""
        prefix_bytes = byte_list[:-1]
        last = format_last(byte_list[-1])
        if prefix_bytes and all(is_text(b) for b in prefix_bytes):
            text = ''.join(chr(b) for b in prefix_bytes)
            return f'DB "{text}", {last}'
        # Single byte, or some non-text byte in the prefix: all-operand form.
        parts = [hex_operand(b) for b in prefix_bytes]
        parts.append(last)
        return f"DB {','.join(parts)}"

    def line_prefix(lbl):
        """Prefix for an output line: the label (if any) or the indent."""
        if lbl:
            # Pads up to the indent column when the label is shorter than it.
            return f"{lbl}:{indent[len(lbl)+1:]}"
        return indent

    def format_overflow(extra):
        """Render bytes that were read past the end of the table."""
        if len(extra) >= 2 and len(set(extra)) == 1:
            return [f"{indent}DS {len(extra)}, {hex_operand(extra[0])}\n"]
        return [
            f"{indent}DB {','.join(hex_operand(b) for b in extra[k:k + 16])}\n"
            for k in range(0, len(extra), 16)
        ]

    result = []
    i = 0
    total = len(lines)
    while i < total:
        # Check if this line starts a keyword table
        table_label = next(
            (label for label in kw_tables if lines[i].startswith(label + ':')),
            None)
        if table_label is None:
            result.append(lines[i])
            i += 1
            continue
        table_size = kw_tables[table_label]

        # Found a keyword table. Collect all data bytes and track labels.
        raw_bytes = []
        first_comment = None
        label_at_offset = {0: table_label}  # byte offset within table -> label
        j = i
        while j < total and len(raw_bytes) < table_size:
            line = lines[j]
            m_db = labeled_db_pat.match(line) or db_pat.match(line)
            m_ds = None
            if m_db is None:
                m_ds = labeled_ds_pat.match(line) or ds_pat.match(line)
            m = m_db or m_ds
            if m is None:
                break

            # Group 1 is either "LABEL:<ws>" or pure indentation.
            head = m.group(1).rstrip()
            if head.endswith(':') and head[:-1] != table_label:
                label_at_offset[len(raw_bytes)] = head[:-1]

            if m_db is not None:
                # Group 3 is the comment; it is None when the line has none.
                if first_comment is None and m_db.group(3):
                    first_comment = m_db.group(3).strip()
                raw_bytes.extend(extract_bytes_from_db(m_db.group(2)))
            else:
                val = parse_hex_val(m_ds.group(3))
                if val is not None:
                    raw_bytes.extend([val] * int(m_ds.group(2)))
            j += 1

        # Nothing usable was collected: pass the source through untouched so
        # neither the line nor its label is lost, and always advance.
        if j == i or not raw_bytes:
            j = max(j, i + 1)
            result.extend(lines[i:j])
            i = j
            continue

        # Parse keywords from raw bytes, tracking byte offsets
        keywords = []  # list of (byte_offset, [bytes])
        kw = []
        kw_start = 0
        for idx, b in enumerate(raw_bytes[:table_size]):
            kw.append(b)
            if b & 0x80:
                keywords.append((kw_start, kw))
                kw = []
                kw_start = idx + 1
        if kw:
            keywords.append((kw_start, kw))

        # Output reformatted keyword table with labels preserved.
        # Labels may fall at keyword boundaries OR mid-keyword.
        for ki, (kw_offset, kw_bytes) in enumerate(keywords):
            comment = f" {first_comment}" if ki == 0 and first_comment else ''

            # Labels within this keyword's byte range: relative offset -> name
            kw_labels = {
                off: label_at_offset[kw_offset + off]
                for off in range(len(kw_bytes))
                if kw_offset + off in label_at_offset
            }

            # Split points: 0, each label position, end of keyword.
            split_pts = sorted({0, len(kw_bytes), *kw_labels})
            for si in range(len(split_pts) - 1):
                seg_start = split_pts[si]
                seg_end = split_pts[si + 1]
                chunk = kw_bytes[seg_start:seg_end]
                prefix = line_prefix(kw_labels.get(seg_start))
                cmt = comment if si == 0 else ''

                if seg_end == len(kw_bytes):
                    # Final segment carries the keyword terminator (if any).
                    body = format_keyword(chunk)
                else:
                    body = f"DB {','.join(hex_operand(b) for b in chunk)}"
                result.append(f"{prefix}{body}{cmt}\n")

        # Bytes read beyond the table belong to whatever follows it.
        overflow = raw_bytes[table_size:]
        if overflow:
            result.extend(format_overflow(overflow))

        i = j

    return result
|
|
|
|
def main():
    """Command-line entry point: post-process one dz80 listing.

    Reads the input file named by the first argument, runs the processing
    stages in order, and writes the result to the second argument, or back
    over the input file when no second argument is given.  Exits with
    status 1 after printing usage when no input file is supplied.
    """
    argv = sys.argv
    if len(argv) < 2:
        print(f"Usage: {argv[0]} <input.asm> [output.asm]")
        print(" Post-processes dz80 output for GLASS Z80 assembler compatibility.")
        sys.exit(1)

    src_path = argv[1]
    dst_path = argv[2] if len(argv) > 2 else src_path

    with open(src_path, 'r') as handle:
        listing = handle.readlines()
    lines_in = len(listing)

    # Step 1: Collect defined labels and find undefined references
    known = collect_defined_labels(listing)
    missing = find_undefined_refs(listing, known)
    print(f" Defined labels: {len(known)}")
    print(f" Undefined Lxxxx references: {len(missing)}")

    # Step 2: Replace undefined Lxxxx references with numeric values
    listing = [replace_undefined_refs(text, missing) for text in listing]

    # Steps 3-6, in order: fix string quoting, compact same-value DB runs
    # into DS, reformat keyword tables, then stringify printable DB runs.
    # Keyword tables must be handled BEFORE stringify so labels inside the
    # tables are preserved.
    stages = (
        fix_string_issues,
        compact_db_sequences,
        format_keyword_tables,
        stringify_db_runs,
    )
    for stage in stages:
        listing = stage(listing)

    lines_out = len(listing)
    print(f" Lines: {lines_in} -> {lines_out} (compacted {lines_in - lines_out} lines)")

    with open(dst_path, 'w') as handle:
        handle.writelines(listing)

    print(f" Output: {dst_path}")
|
|
|
|
# Run the post-processor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|