Do some work on the CPU, assembler still needs update

This commit is contained in:
2025-02-02 21:48:35 +01:00
parent b7e5e2aa35
commit 0a0f953f09
15 changed files with 1473 additions and 71 deletions

525
assembler/assembler.c Normal file
View File

@@ -0,0 +1,525 @@
//
// Created by bruno on 1.2.2025.
//
#include "assembler.h"
// Label table: addLabel() appends entries here and lookupLabel() searches them.
Label labels[MAX_LABELS];
// Number of entries of `labels` currently in use.
int labelCount = 0;
//
// Helper functions for string manipulation
//
// Strip leading and trailing whitespace from `s`, in place.
//
// The kept text is shifted to the start of the buffer so callers that keep
// using their own pointer to `s` see the trimmed string. A NULL pointer is
// ignored; an empty or all-whitespace string becomes "".
void trim(char *s) {
    if (s == NULL)
        return;

    // Find the first character that is not whitespace ('\0' is not whitespace,
    // so this always stops inside the string).
    const char *start = s;
    while (isspace((unsigned char) *start))
        start++;

    // Shrink the length until the last kept character is not whitespace.
    size_t len = strlen(start);
    while (len > 0 && isspace((unsigned char) start[len - 1]))
        len--;

    // Source and destination overlap, so memmove (not memcpy) is required.
    if (start != s)
        memmove(s, start, len);
    s[len] = '\0';
}
// Look up a label by name; returns -1 if not found.
int lookupLabel(const char *name) {
for (int i = 0; i < labelCount; i++) {
if (strcmp(labels[i].name, name) == 0)
return labels[i].address;
}
return -1;
}
// Add a label to the table
void addLabel(const char *name, int address) {
if (labelCount >= MAX_LABELS) {
fprintf(stderr, "Too many labels!\n");
exit(1);
}
strncpy(labels[labelCount].name, name, sizeof(labels[labelCount].name));
labels[labelCount].address = address;
labelCount++;
}
//
// Parse a register string (e.g., "R0", "r1") and return its number.
//
// The token must be 'R' or 'r' followed by decimal digits only, and the number
// must be in the range [0, REG_COUNT). Anything else ("R", "Rx", "R1x", "R-1",
// an out-of-range number, or NULL) is rejected.
// Returns -1 on error.
int parseRegister(const char *token) {
    if (token == NULL || (token[0] != 'R' && token[0] != 'r'))
        return -1;

    // Demand a digit right after the prefix: strtol on its own would accept
    // leading whitespace or a sign, and atoi would silently turn "R" into 0.
    if (!isdigit((unsigned char) token[1]))
        return -1;

    char *end;
    long reg = strtol(token + 1, &end, 10);
    if (*end != '\0')
        return -1; // trailing characters after the register number

    if (reg < 0 || reg >= REG_COUNT)
        return -1;
    return (int) reg;
}
// Parse an immediate value (supports decimal and 0x... hexadecimal).
//
// A token longer than two characters that starts with "0x"/"0X" is read as
// hexadecimal, everything else as (optionally signed) decimal. Characters
// after the number are ignored. The result is reduced to its low 8 bits, so
// "300" yields 44 and "-1" yields 255.
// Returns 0 when the token does not start with a number.
uint8_t parseImmediate(const char *token) {
    int isHex = strlen(token) > 2 && token[0] == '0' &&
                (token[1] == 'x' || token[1] == 'X');

    char *end;
    long value = strtol(token, &end, isHex ? 16 : 10);
    if (end == token)
        return 0; // nothing was converted; avoid returning an indeterminate value

    return (uint8_t) value;
}
// Convert the ASCII letters 'a'..'z' in `string` to upper case, in place.
// All other characters are left unchanged.
void toUpperCase(char *string) {
    // The pointer must advance on every iteration; without the increment the
    // loop never reaches the terminating '\0'.
    for (; *string; string++) {
        if (*string >= 'a' && *string <= 'z') {
            *string -= 'a' - 'A';
        }
    }
}
//
// Map an instruction mnemonic (string) to its opcode value.
//
// The mnemonic is upper-cased in place before the lookup, so the caller's
// buffer is modified. Mnemonics that have several encodings return a negative
// placeholder instead of an opcode; the caller picks the final opcode from the
// operand types:
//   -2 MOV, -3 ADD, -4 SUB, -5 MUL, -6 DIV, -7 MOD, -8 AND, -9 OR, -10 XOR
// Returns -1 for a mnemonic that is not in the table.
// (In a full assembler you might use a more sophisticated data structure.)
//
int getOpcode(char *mnemonic) {
    static const struct {
        const char *text;
        int value;
    } known[] = {
        {"BRK",   BRK},
        {"NOP",   NOP},
        {"MOV",   -2},  // MOV_RN_IMM, MOV_RN_RM, MOV_RN_ADDR or MOV_ADDR_RN
        {"SWAP",  SWAP},
        {"SWAPN", SWAPN},
        {"ADD",   -3},  // ADD_RN_RM or ADD_RN_IMM
        {"SUB",   -4},  // SUB_RN_RM or SUB_RN_IMM
        {"MUL",   -5},  // MUL_RN_RM or MUL_RN_IMM
        {"DIV",   -6},  // DIV_RN_RM or DIV_RN_IMM
        {"MOD",   -7},  // MOD_RN_RM or MOD_RN_IMM
        {"NEG",   NEG_RN},
        {"AND",   -8},  // AND_RN_RM or AND_RN_IMM
        {"OR",    -9},  // OR_RN_RM or OR_RN_IMM
        {"XOR",   -10}, // XOR_RN_RM or XOR_RN_IMM
        {"NOT",   NOT_RN},
        {"SHL",   SHL_RN_IMM},
        {"SHR",   SHR_RN_IMM},
        {"SAR",   SAR_RN_IMM},
        {"JMP",   JMP},
        {"CMP",   CMP},
        {"JE",    JE},
        {"JNE",   JNE},
        {"JG",    JG},
        {"JL",    JL},
        {"JGE",   JGE},
        {"JLE",   JLE},
        {"CALL",  CALL},
        {"RET",   RET},
        {"PUSH",  PUSH},
        {"POP",   POP},
        {"PUSHF", PUSHF},
        {"POPF",  POPF},
    };

    toUpperCase(mnemonic);

    for (size_t i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
        if (strcmp(mnemonic, known[i].text) == 0) {
            return known[i].value;
        }
    }
    return -1;
}
//
// In this simple assembler, some instructions share a mnemonic, and we must choose the correct opcode
// based on the type of the operand (register vs. immediate vs. memory).
// The following helper functions decide that, given two operands (as strings).
//
// For example, "MOV Rn, 42" should choose MOV_RN_IMM, while "MOV Rn, Rm" should choose MOV_RN_RM.
// We assume that memory addresses are written in square brackets, e.g. "[123]".
//
// Choose the MOV opcode from the first character of each operand.
//   dest starts with '['        -> MOV_ADDR_RN (src is not inspected)
//   src starts with 'R' or 'r'  -> MOV_RN_RM
//   src starts with '['         -> MOV_RN_ADDR
//   anything else               -> MOV_RN_IMM
int resolveMOV(const char *dest, const char *src) {
    // A bracketed destination means a store to memory.
    if (dest[0] == '[') {
        return MOV_ADDR_RN;
    }

    // The destination is a register; the source decides the variant.
    switch (src[0]) {
        case 'R':
        case 'r':
            return MOV_RN_RM;
        case '[':
            return MOV_RN_ADDR;
        default:
            return MOV_RN_IMM;
    }
}
// Choose the register or immediate form of a two-operand ALU instruction.
//
// baseOpcode: placeholder returned by getOpcode() (-3 ADD, -4 SUB, -5 MUL,
//             -6 DIV, -7 MOD, -8 AND, -9 OR, -10 XOR).
// src:        source operand text; a leading 'R' or 'r' selects the *_RN_RM
//             opcode, anything else the *_RN_IMM opcode.
// Returns -1 when baseOpcode is not one of the placeholders above.
int resolveALU(int baseOpcode, const char *src) {
    const int fromRegister = (src[0] == 'R' || src[0] == 'r');

    switch (baseOpcode) {
        case -3:
            return fromRegister ? ADD_RN_RM : ADD_RN_IMM;
        case -4:
            return fromRegister ? SUB_RN_RM : SUB_RN_IMM;
        case -5:
            return fromRegister ? MUL_RN_RM : MUL_RN_IMM;
        case -6:
            return fromRegister ? DIV_RN_RM : DIV_RN_IMM;
        case -7:
            return fromRegister ? MOD_RN_RM : MOD_RN_IMM;
        case -8:
            return fromRegister ? AND_RN_RM : AND_RN_IMM;
        case -9:
            return fromRegister ? OR_RN_RM : OR_RN_IMM;
        case -10:
            return fromRegister ? XOR_RN_RM : XOR_RN_IMM;
        default:
            return -1;
    }
}
// Reads a single line from the source string.
//
// source: NUL-terminated text to read from.
// buffer: receives the line without its '\n', always NUL-terminated when
//         maxLen > 0.
// maxLen: size of `buffer` in bytes. At most maxLen - 1 characters are stored;
//         the remainder of a longer line is discarded rather than being handed
//         back as if it were the next line. With maxLen == 0 nothing is written.
//
// Returns a pointer to the start of the next line, or to the terminating '\0'
// when the source is exhausted.
const char *readLine(const char *source, char *buffer, size_t maxLen) {
    size_t i = 0;
    while (*source && *source != '\n') {
        // `i + 1 < maxLen` keeps one byte free for the terminator and, unlike
        // `i < maxLen - 1`, cannot wrap around when maxLen is 0.
        if (i + 1 < maxLen)
            buffer[i++] = *source;
        source++;
    }
    if (maxLen > 0)
        buffer[i] = '\0';
    return (*source == '\n') ? source + 1 : source;
}
//
// The first pass scans the assembly source to record all labels and their addresses.
// The address is the byte offset the labelled instruction will have in the output
// machine code, computed by adding up a per-mnemonic instruction size
// (opcode byte plus operand bytes).
//
// Lines that are empty after trimming, or that start with ';' or '#', are skipped.
// Text before a ':' is registered as a label at the current address; any text
// after the ':' is then sized as an instruction.
//
// Returns the total size in bytes of the sized instructions.
//
int firstPass(const char *source) {
    char line[MAX_LINE_LENGTH];
    int addr = 0;
    const char *ptr = source;
    while (*ptr) {
        // Read a line from the source string
        ptr = readLine(ptr, line, sizeof(line));
        trim(line);
        if (line[0] == '\0' || line[0] == ';' || line[0] == '#')
            continue; // Skip empty or comment lines

        char *colon = strchr(line, ':');
        if (colon != NULL) {
            // Split "label: rest" at the colon and record the label.
            *colon = '\0';
            trim(line);
            addLabel(line, addr);
            char *rest = colon + 1;
            trim(rest);
            if (rest[0] == '\0')
                continue; // label on a line of its own
            // `rest` points into `line`, so the ranges overlap: memmove is
            // required here, strcpy on overlapping buffers is undefined.
            memmove(line, rest, strlen(rest) + 1);
        }

        // The size is derived from the mnemonic alone.
        // (A more robust approach would have a table for instruction sizes.)
        char mnemonic[32];
        if (sscanf(line, "%31s", mnemonic) != 1)
            continue; // nothing but whitespace: no mnemonic was stored

        int opcode = getOpcode(mnemonic);
        if (opcode == -2) {
            // MOV: two operands separated by comma, e.g. MOV R1, 42
            // 3 bytes: opcode, operand1, operand2.
            addr += 3;
        } else if (opcode <= -3 && opcode >= -10) {
            // ALU instructions with two operands: 3 bytes.
            addr += 3;
        } else if (opcode == NEG_RN || opcode == SWAPN || opcode == NOT_RN) {
            // One operand: 2 bytes.
            addr += 2;
        } else if (opcode == SWAP || opcode == CMP) {
            // Two operands: 3 bytes.
            addr += 3;
        } else if (opcode == SHL_RN_IMM || opcode == SHR_RN_IMM ||
                   opcode == SAR_RN_IMM) {
            // Register plus immediate shift count: 3 bytes.
            addr += 3;
        } else if (opcode == JMP || opcode == JE || opcode == JNE ||
                   opcode == JG || opcode == JL || opcode == JGE || opcode == JLE ||
                   opcode == CALL) {
            // Jump or call: 2 bytes (opcode and one byte address/immediate).
            addr += 2;
        } else if (opcode == RET || opcode == PUSHF || opcode == POPF) {
            // No operand: 1 byte.
            addr += 1;
        } else if (opcode == PUSH || opcode == POP) {
            // One register operand: 2 bytes.
            addr += 2;
        } else {
            // Everything else (including unrecognised mnemonics) is assumed
            // to take 3 bytes.
            addr += 3;
        }
    }
    return addr;
}
//
// The second pass actually translates the assembly instructions to machine code.
// The machine code is written into the provided buffer. (It must be large enough.)
//
int secondPass(const char *source, uint8_t *code) {
char line[MAX_LINE_LENGTH];
int addr = 0;
const char *ptr = source;
while (*ptr) {
ptr = readLine(ptr, line, sizeof(line));
trim(line);
if (line[0] == '\0' || line[0] == ';' || line[0] == '#')
continue;
char *colon = strchr(line, ':');
if (colon != NULL) {
*colon = ' ';
}
if (strlen(line) == 0)
continue;
char *token = strtok(line, " ,");
if (!token)
continue;
char mnemonic[32];
strncpy(mnemonic, token, sizeof(mnemonic));
int opcode = getOpcode(mnemonic);
code[addr++] = opcode;
// Handle instructions that need operand disambiguation.
if (strcmp(mnemonic, "MOV") == 0) {
// Get first operand.
char *dest = strtok(NULL, " ,");
char *src = strtok(NULL, " ,");
if (!dest || !src) {
fprintf(stderr, "Error: MOV requires two operands.\n");
exit(1);
}
int opcode2 = resolveMOV(dest, src);
code[addr++] = opcode2;
// For the MOV instructions we decide that:
// - For MOV_RN_IMM: operand bytes: [register, immediate]
// - For MOV_RN_RM: operand bytes: [dest register, src register]
// - For MOV_RN_ADDR: operand bytes: [dest register, address]
// - For MOV_ADDR_RN: operand bytes: [address, register]
if (opcode2 == MOV_RN_IMM) {
int reg = parseRegister(dest);
uint8_t imm = parseImmediate(src);
code[addr++] = reg;
code[addr++] = imm;
} else if (opcode2 == MOV_RN_RM) {
int regDest = parseRegister(dest);
int regSrc = parseRegister(src);
code[addr++] = regDest;
code[addr++] = regSrc;
} else if (opcode2 == MOV_RN_ADDR) {
// src is memory reference like "[123]"
int regDest = parseRegister(dest);
// Remove the brackets.
char addrStr[32];
strncpy(addrStr, src + 1, strlen(src) - 2);
addrStr[strlen(src) - 2] = '\0';
uint8_t memAddr = parseImmediate(addrStr);
code[addr++] = regDest;
code[addr++] = memAddr;
} else if (opcode2 == MOV_ADDR_RN) {
// dest is a memory reference, src is a register.
// Remove brackets from dest.
char addrStr[32];
strncpy(addrStr, dest + 1, strlen(dest) - 2);
addrStr[strlen(dest) - 2] = '\0';
uint8_t memAddr = parseImmediate(addrStr);
int regSrc = parseRegister(src);
code[addr++] = memAddr;
code[addr++] = regSrc;
}
} else if (strcmp(mnemonic, "ADD") == 0 ||
strcmp(mnemonic, "SUB") == 0 ||
strcmp(mnemonic, "MUL") == 0 ||
strcmp(mnemonic, "DIV") == 0 ||
strcmp(mnemonic, "MOD") == 0 ||
strcmp(mnemonic, "AND") == 0 ||
strcmp(mnemonic, "OR") == 0 ||
strcmp(mnemonic, "XOR") == 0) {
// ALU instructions with two operands.
char *dest = strtok(NULL, " ,");
char *src = strtok(NULL, " ,");
if (!dest || !src) {
fprintf(stderr, "Error: %s requires two operands.\n", mnemonic);
exit(1);
}
int baseOpcode;
if (strcmp(mnemonic, "ADD") == 0) baseOpcode = -3;
else if (strcmp(mnemonic, "SUB") == 0) baseOpcode = -4;
else if (strcmp(mnemonic, "MUL") == 0) baseOpcode = -5;
else if (strcmp(mnemonic, "DIV") == 0) baseOpcode = -6;
else if (strcmp(mnemonic, "MOD") == 0) baseOpcode = -7;
else if (strcmp(mnemonic, "AND") == 0) baseOpcode = -8;
else if (strcmp(mnemonic, "OR") == 0) baseOpcode = -9;
else if (strcmp(mnemonic, "XOR") == 0) baseOpcode = -10;
else baseOpcode = -1;
int opcode3 = resolveALU(baseOpcode, src);
code[addr++] = opcode3;
int regDest = parseRegister(dest);
code[addr++] = regDest;
// For a register source, encode the register; for an immediate, encode the value.
if (src[0] == 'R' || src[0] == 'r') {
int regSrc = parseRegister(src);
code[addr++] = regSrc;
} else {
uint8_t imm = parseImmediate(src);
code[addr++] = imm;
}
} else if (strcmp(mnemonic, "NEG") == 0 ||
strcmp(mnemonic, "SWAPN") == 0 ||
strcmp(mnemonic, "NOT") == 0) {
// One operand instructions.
char *op = strtok(NULL, " ,");
if (!op) {
fprintf(stderr, "Error: %s requires one operand.\n", mnemonic);
exit(1);
}
int opcode4 = getOpcode(mnemonic);
code[addr++] = opcode4;
int reg = parseRegister(op);
code[addr++] = reg;
} else if (strcmp(mnemonic, "SWAP") == 0 || strcmp(mnemonic, "CMP") == 0) {
// Two operand instructions: both registers.
char *op1 = strtok(NULL, " ,");
char *op2 = strtok(NULL, " ,");
if (!op1 || !op2) {
fprintf(stderr, "Error: %s requires two operands.\n", mnemonic);
exit(1);
}
int opcode5 = getOpcode(mnemonic);
code[addr++] = opcode5;
int r1 = parseRegister(op1);
int r2 = parseRegister(op2);
code[addr++] = r1;
code[addr++] = r2;
} else if (strcmp(mnemonic, "SHL") == 0 ||
strcmp(mnemonic, "SHR") == 0 ||
strcmp(mnemonic, "SAR") == 0 ||
strcmp(mnemonic, "SHRS") == 0) {
// Shift instructions: one register operand and one immediate.
char *regToken = strtok(NULL, " ,");
char *immToken = strtok(NULL, " ,");
if (!regToken || !immToken) {
fprintf(stderr, "Error: %s requires two operands.\n", mnemonic);
exit(1);
}
int opcode6 = getOpcode(mnemonic);
code[addr++] = opcode6;
int reg = parseRegister(regToken);
code[addr++] = reg;
uint8_t imm = parseImmediate(immToken);
code[addr++] = imm;
} else if (strcmp(mnemonic, "JMP") == 0 ||
strcmp(mnemonic, "JE") == 0 ||
strcmp(mnemonic, "JNE") == 0 ||
strcmp(mnemonic, "JG") == 0 ||
strcmp(mnemonic, "JL") == 0 ||
strcmp(mnemonic, "JGE") == 0 ||
strcmp(mnemonic, "JLE") == 0 ||
strcmp(mnemonic, "CALL") == 0) {
// Jump instructions: one operand which may be a label or an immediate address.
char *operand = strtok(NULL, " ,");
if (!operand) {
fprintf(stderr, "Error: %s requires an operand.\n", mnemonic);
exit(1);
}
int opcode7 = getOpcode(mnemonic);
code[addr++] = opcode7;
// If the operand is not a number, assume it is a label.
if (!isdigit(operand[0])) {
int labelAddr = lookupLabel(operand);
if (labelAddr < 0) {
fprintf(stderr, "Error: undefined label '%s'\n", operand);
exit(1);
}
code[addr++] = (uint8_t) labelAddr;
} else {
uint8_t imm = parseImmediate(operand);
code[addr++] = imm;
}
} else if (strcmp(mnemonic, "RET") == 0 ||
strcmp(mnemonic, "PUSHF") == 0 ||
strcmp(mnemonic, "POPF") == 0) {
// Instructions with no operand.
int opcode8 = getOpcode(mnemonic);
code[addr++] = opcode8;
} else if (strcmp(mnemonic, "PUSH") == 0 ||
strcmp(mnemonic, "POP") == 0) {
// One operand (a register)
char *regToken = strtok(NULL, " ,");
if (!regToken) {
fprintf(stderr, "Error: %s requires a register operand.\n", mnemonic);
exit(1);
}
int opcode9 = getOpcode(mnemonic);
code[addr++] = opcode9;
int reg = parseRegister(regToken);
code[addr++] = reg;
} else {
fprintf(stderr, "Error: Unknown instruction '%s'\n", mnemonic);
exit(1);
}
}
return addr;
}
// Assemble `input` into `cpu->memory`.
//
// Runs the first pass to record label addresses, clears MEM_SIZE bytes of the
// CPU memory, then runs the second pass to write the machine code from
// offset 0 of that memory.
void completePass(const char *input, CPU *cpu) {
    // Start from an empty label table; otherwise labels left over from an
    // earlier call would be found first by lookupLabel() and resolve to the
    // previous program's addresses.
    labelCount = 0;

    // First pass: determine label addresses.
    firstPass(input);

    memset(cpu->memory, 0, MEM_SIZE);

    // Second pass: generate machine code.
    secondPass(input, cpu->memory);
}