Files
RISC-B/assembler/assembler.c

525 lines
19 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//
// Created by bruno on 1.2.2025.
//
#include "assembler.h"
// Symbol table: entries [0, labelCount) are filled by addLabel() and
// searched by lookupLabel(). Capacity is fixed at MAX_LABELS.
Label labels[MAX_LABELS];
// Number of entries of `labels` currently in use.
int labelCount = 0;
//
// Helper functions for string manipulation
//
// Strips leading and trailing whitespace from `s` in place.
// The surviving text is shifted to the start of the buffer, so the caller's
// pointer keeps referring to the trimmed string. An all-whitespace or empty
// input becomes the empty string.
void trim(char *s) {
    // Find the first character worth keeping.
    const char *start = s;
    while (isspace((unsigned char) *start))
        start++;

    // Shrink the length until the last kept character is not whitespace.
    size_t len = strlen(start);
    while (len > 0 && isspace((unsigned char) start[len - 1]))
        len--;

    // Source and destination overlap, so memmove (not memcpy) is required.
    memmove(s, start, len);
    s[len] = '\0';
}
// Resolve a label name to its recorded address.
// Scans the table in insertion order and returns the address of the first
// entry whose name matches exactly; returns -1 when no entry matches.
int lookupLabel(const char *name) {
    int idx = 0;
    while (idx < labelCount) {
        if (!strcmp(labels[idx].name, name)) {
            return labels[idx].address;
        }
        idx++;
    }
    return -1;
}
// Add a label to the table.
// `name` is copied into the next free slot (truncated if it does not fit)
// together with `address`. Prints an error and exits the process when the
// table already holds MAX_LABELS entries.
void addLabel(const char *name, int address) {
    if (labelCount >= MAX_LABELS) {
        fprintf(stderr, "Too many labels!\n");
        exit(1);
    }
    // strncpy does not write a terminator when the source fills the buffer,
    // so copy one byte less and terminate explicitly.
    strncpy(labels[labelCount].name, name, sizeof(labels[labelCount].name) - 1);
    labels[labelCount].name[sizeof(labels[labelCount].name) - 1] = '\0';
    labels[labelCount].address = address;
    labelCount++;
}
//
// Parse a register string (e.g., "R0", "R1", etc.) and return its number.
// The token must be 'R' or 'r' followed by decimal digits only, and the
// number must be below REG_COUNT.
// Returns -1 on error (missing prefix, no digits, trailing characters,
// or register number out of range).
int parseRegister(const char *token) {
    if (token[0] != 'R' && token[0] != 'r')
        return -1;
    // strtol would skip whitespace and accept a sign; insist on a digit so
    // that "R", "R-0" or "Rfoo" are rejected instead of reading as register 0.
    if (!isdigit((unsigned char) token[1]))
        return -1;
    char *end;
    long reg = strtol(token + 1, &end, 10);
    // On overflow strtol returns LONG_MAX, which the range check rejects.
    if (*end != '\0' || reg >= REG_COUNT)
        return -1;
    return (int) reg;
}
// Parse an immediate value (supports decimal and 0x... hexadecimal).
// The result is reduced to 8 bits, so "-1" yields 255 and "300" yields 44.
// A token that does not start with a number yields 0.
uint8_t parseImmediate(const char *token) {
    int isHex = strlen(token) > 2 && token[0] == '0' &&
                (token[1] == 'x' || token[1] == 'X');
    // The strto* functions return 0 when nothing can be converted, so the
    // result is always defined (sscanf left the variable uninitialized).
    if (isHex)
        return (uint8_t) strtoul(token, NULL, 16);
    return (uint8_t) strtol(token, NULL, 10);
}
// Converts ASCII lowercase letters in `string` to uppercase, in place.
// All other bytes are left untouched; the conversion is locale-independent.
void toUpperCase(char *string) {
    // The pointer must advance on every iteration; without the increment the
    // loop never terminates on a non-empty string.
    for (; *string; string++) {
        if (*string >= 'a' && *string <= 'z') {
            *string -= 'a' - 'A';
        }
    }
}
//
// Map an instruction mnemonic to its opcode value.
// The mnemonic is upper-cased in place first, so matching is case-insensitive
// and the caller's buffer is modified.
// Mnemonics that cover several encodings return a negative placeholder, and
// the parser decides later which real opcode to use (we'll look at the
// operands for that):
//   MOV -2, ADD -3, SUB -4, MUL -5, DIV -6, MOD -7, AND -8, OR -9, XOR -10.
// Returns -1 for an unknown mnemonic.
//
int getOpcode(char *mnemonic) {
    const struct {
        const char *name;
        int opcode;
    } table[] = {
        {"BRK", BRK},
        {"NOP", NOP},
        {"MOV", -2},   // MOV_RN_IMM, MOV_RN_RM, MOV_RN_ADDR or MOV_ADDR_RN
        {"SWAP", SWAP},
        {"SWAPN", SWAPN},
        {"ADD", -3},   // ADD_RN_RM or ADD_RN_IMM
        {"SUB", -4},   // SUB_RN_RM or SUB_RN_IMM
        {"MUL", -5},   // MUL_RN_RM or MUL_RN_IMM
        {"DIV", -6},   // DIV_RN_RM or DIV_RN_IMM
        {"MOD", -7},   // MOD_RN_RM or MOD_RN_IMM
        {"NEG", NEG_RN},
        {"AND", -8},   // AND_RN_RM or AND_RN_IMM
        {"OR", -9},    // OR_RN_RM or OR_RN_IMM
        {"XOR", -10},  // XOR_RN_RM or XOR_RN_IMM
        {"NOT", NOT_RN},
        {"SHL", SHL_RN_IMM},
        {"SHR", SHR_RN_IMM},
        {"SAR", SAR_RN_IMM},
        {"JMP", JMP},
        {"CMP", CMP},
        {"JE", JE},
        {"JNE", JNE},
        {"JG", JG},
        {"JL", JL},
        {"JGE", JGE},
        {"JLE", JLE},
        {"CALL", CALL},
        {"RET", RET},
        {"PUSH", PUSH},
        {"POP", POP},
        {"PUSHF", PUSHF},
        {"POPF", POPF},
    };

    toUpperCase(mnemonic);
    for (size_t k = 0; k < sizeof(table) / sizeof(table[0]); k++) {
        if (strcmp(mnemonic, table[k].name) == 0) {
            return table[k].opcode;
        }
    }
    return -1;
}
//
// In this simple assembler, some instructions share a mnemonic, and we must choose the correct opcode
// based on the type of the operand (register vs. immediate vs. memory).
// The following helper functions decide that, given two operands (as strings).
//
// For example, "MOV Rn, 42" should choose MOV_RN_IMM, while "MOV Rn, Rm" should choose MOV_RN_RM.
// We assume that memory addresses are written in square brackets, e.g. "[123]".
//
// Choose the concrete MOV opcode from the first character of each operand.
// A '[' destination means a store to memory; otherwise the destination is a
// register and the source decides: 'R'/'r' register, '[' memory, else immediate.
int resolveMOV(const char *dest, const char *src) {
    if (dest[0] == '[') {
        // MOV [Addr], Rn: the register is the second operand.
        return MOV_ADDR_RN;
    }
    switch (src[0]) {
        case 'R':
        case 'r':
            return MOV_RN_RM;
        case '[':
            return MOV_RN_ADDR;
        default:
            return MOV_RN_IMM;
    }
}
// Choose the concrete ALU opcode for a placeholder returned by getOpcode().
// baseOpcode is the negative placeholder (-3 ADD ... -10 XOR); a source that
// starts with 'R' or 'r' selects the register form, anything else the
// immediate form. Returns -1 for an unrecognized placeholder.
int resolveALU(int baseOpcode, const char *src) {
    const int fromRegister = (src[0] == 'R' || src[0] == 'r');

    switch (baseOpcode) {
        case -3:
            return fromRegister ? ADD_RN_RM : ADD_RN_IMM;
        case -4:
            return fromRegister ? SUB_RN_RM : SUB_RN_IMM;
        case -5:
            return fromRegister ? MUL_RN_RM : MUL_RN_IMM;
        case -6:
            return fromRegister ? DIV_RN_RM : DIV_RN_IMM;
        case -7:
            return fromRegister ? MOD_RN_RM : MOD_RN_IMM;
        case -8:
            return fromRegister ? AND_RN_RM : AND_RN_IMM;
        case -9:
            return fromRegister ? OR_RN_RM : OR_RN_IMM;
        case -10:
            return fromRegister ? XOR_RN_RM : XOR_RN_IMM;
        default:
            return -1;
    }
}
// Reads a single line from the source string.
// Copies characters up to (not including) the next '\n' or the end of
// `source` into `buffer`, storing at most maxLen - 1 characters plus a
// terminator. Characters that do not fit are discarded rather than being
// handed back as a separate line.
// Returns a pointer to the start of the next line, or to the terminating
// '\0' of `source` when the input is exhausted. With maxLen == 0 the buffer
// is not written at all.
const char *readLine(const char *source, char *buffer, size_t maxLen) {
    size_t i = 0;
    while (*source && *source != '\n') {
        // `i + 1 < maxLen` avoids the unsigned underflow of `maxLen - 1`.
        if (i + 1 < maxLen)
            buffer[i++] = *source;
        // Always consume the character so the caller makes progress even
        // when the buffer is full.
        source++;
    }
    if (maxLen > 0)
        buffer[i] = '\0';
    return (*source == '\n') ? source + 1 : source;
}
//
// Two-pass assembly.
//
// firstPass() walks the source to record every label with the address of the
// instruction that follows it; secondPass() walks it again and emits machine
// code. Both passes classify each mnemonic through the same operand-form
// table, so the sizes used for label addresses always match the bytes that
// are actually emitted.
//
// Line syntax accepted by both passes:
//   [label:] [MNEMONIC operand[, operand]] [; comment | # comment]
// Operands are separated by spaces, tabs or commas.
//

// Characters that separate the mnemonic and its operands.
static const char TOKEN_DELIMS[] = " \t,";

// Operand layout of an instruction; determines its encoded size.
typedef enum {
    FORM_UNKNOWN,    // mnemonic not recognized
    FORM_NONE,       // opcode
    FORM_REG,        // opcode, register
    FORM_STACK_REG,  // opcode, register (PUSH/POP; separate error message)
    FORM_REG_REG,    // opcode, register, register
    FORM_REG_IMM,    // opcode, register, immediate
    FORM_TARGET,     // opcode, one-byte address (label or immediate)
    FORM_MOV,        // opcode chosen by resolveMOV, two operand bytes
    FORM_ALU         // opcode chosen by resolveALU, register, register/immediate
} AsmOperandForm;

// Returns the operand form of an upper-case mnemonic, FORM_UNKNOWN if none.
static AsmOperandForm operandForm(const char *mnemonic) {
    static const struct {
        const char *name;
        AsmOperandForm form;
    } forms[] = {
        {"BRK", FORM_NONE}, {"NOP", FORM_NONE}, {"RET", FORM_NONE},
        {"PUSHF", FORM_NONE}, {"POPF", FORM_NONE},
        {"NEG", FORM_REG}, {"SWAPN", FORM_REG}, {"NOT", FORM_REG},
        {"PUSH", FORM_STACK_REG}, {"POP", FORM_STACK_REG},
        {"SWAP", FORM_REG_REG}, {"CMP", FORM_REG_REG},
        {"SHL", FORM_REG_IMM}, {"SHR", FORM_REG_IMM}, {"SAR", FORM_REG_IMM},
        {"JMP", FORM_TARGET}, {"JE", FORM_TARGET}, {"JNE", FORM_TARGET},
        {"JG", FORM_TARGET}, {"JL", FORM_TARGET}, {"JGE", FORM_TARGET},
        {"JLE", FORM_TARGET}, {"CALL", FORM_TARGET},
        {"MOV", FORM_MOV},
        {"ADD", FORM_ALU}, {"SUB", FORM_ALU}, {"MUL", FORM_ALU},
        {"DIV", FORM_ALU}, {"MOD", FORM_ALU}, {"AND", FORM_ALU},
        {"OR", FORM_ALU}, {"XOR", FORM_ALU},
    };
    for (size_t i = 0; i < sizeof(forms) / sizeof(forms[0]); i++) {
        if (strcmp(mnemonic, forms[i].name) == 0)
            return forms[i].form;
    }
    return FORM_UNKNOWN;
}

// Returns the number of bytes an instruction of the given form occupies.
static int formSize(AsmOperandForm form) {
    switch (form) {
        case FORM_NONE:
            return 1;
        case FORM_REG:
        case FORM_STACK_REG:
        case FORM_TARGET:
            return 2;
        default:
            // Two-operand forms. Unknown mnemonics are also counted as
            // 3 bytes; secondPass() rejects them with an error anyway.
            return 3;
    }
}

// Cuts a trailing ';' or '#' comment and surrounding whitespace from `line`
// in place; returns a pointer to the first remaining character.
static char *stripLine(char *line) {
    line[strcspn(line, ";#")] = '\0';
    size_t len = strlen(line);
    while (len > 0 && isspace((unsigned char) line[len - 1]))
        line[--len] = '\0';
    while (isspace((unsigned char) *line))
        line++;
    return line;
}

// Splits "label: rest" at the first ':'. Stores the stripped label in *label
// (NULL when the statement has no label) and returns the stripped remainder.
static char *splitLabel(char *stmt, char **label) {
    char *colon = strchr(stmt, ':');
    if (colon == NULL) {
        *label = NULL;
        return stmt;
    }
    *colon = '\0';
    *label = stripLine(stmt);
    return stripLine(colon + 1);
}

// Parses a register operand; reports an error and exits if it is invalid
// instead of silently emitting -1 as a register byte.
static uint8_t encodeRegister(const char *token, const char *mnemonic) {
    int reg = parseRegister(token);
    if (reg < 0) {
        fprintf(stderr, "Error: invalid register '%s' for %s.\n", token, mnemonic);
        exit(1);
    }
    return (uint8_t) reg;
}

// Parses a memory operand such as "[123]" or "[0x7B]". The caller guarantees
// that operand[0] is '['. The text between the brackets is copied with a
// bound so malformed or over-long operands cannot overflow the buffer.
static uint8_t encodeAddress(const char *operand) {
    char digits[32];
    const char *begin = operand + 1;
    size_t len = strcspn(begin, "]");
    if (len >= sizeof(digits))
        len = sizeof(digits) - 1;
    memcpy(digits, begin, len);
    digits[len] = '\0';
    return parseImmediate(digits);
}

// Encodes a jump/call target: a token starting with a digit is an immediate
// address, anything else is looked up as a label.
static uint8_t encodeTarget(const char *operand) {
    if (isdigit((unsigned char) operand[0]))
        return parseImmediate(operand);
    int labelAddr = lookupLabel(operand);
    if (labelAddr < 0) {
        fprintf(stderr, "Error: undefined label '%s'\n", operand);
        exit(1);
    }
    // The operand is a single byte; refuse addresses that would be truncated.
    if (labelAddr > 0xFF) {
        fprintf(stderr, "Error: label '%s' address %d does not fit in one byte.\n",
                operand, labelAddr);
        exit(1);
    }
    return (uint8_t) labelAddr;
}

//
// The first pass scans the assembly source to record all labels and their addresses.
// A label's address is the offset into the output machine code buffer of the
// instruction that follows it.
// Returns the total size in bytes of the assembled program.
//
int firstPass(const char *source) {
    char line[MAX_LINE_LENGTH];
    int addr = 0;
    const char *ptr = source;
    while (*ptr) {
        ptr = readLine(ptr, line, sizeof(line));
        char *label;
        char *stmt = splitLabel(stripLine(line), &label);
        if (label != NULL && *label != '\0')
            addLabel(label, addr);
        // Blank lines, comment lines and label-only lines have no token.
        char *token = strtok(stmt, TOKEN_DELIMS);
        if (!token)
            continue;
        char mnemonic[32];
        snprintf(mnemonic, sizeof(mnemonic), "%s", token);
        toUpperCase(mnemonic);
        addr += formSize(operandForm(mnemonic));
    }
    return addr;
}

//
// The second pass translates the assembly instructions to machine code.
// The machine code is written into the provided buffer, which must be at
// least as large as the size returned by firstPass(); no bounds check is done
// here. Labels must already have been recorded by firstPass().
// Any malformed instruction prints an error to stderr and exits the process.
// Returns the number of bytes written.
//
int secondPass(const char *source, uint8_t *code) {
    char line[MAX_LINE_LENGTH];
    int addr = 0;
    const char *ptr = source;
    while (*ptr) {
        ptr = readLine(ptr, line, sizeof(line));
        // The label was handled in the first pass; only the remainder matters.
        char *label;
        char *stmt = splitLabel(stripLine(line), &label);
        char *token = strtok(stmt, TOKEN_DELIMS);
        if (!token)
            continue;
        char mnemonic[32];
        snprintf(mnemonic, sizeof(mnemonic), "%s", token);
        // getOpcode() upper-cases the mnemonic in place.
        int opcode = getOpcode(mnemonic);

        switch (operandForm(mnemonic)) {
            case FORM_NONE:
                code[addr++] = (uint8_t) opcode;
                break;

            case FORM_REG: {
                char *op = strtok(NULL, TOKEN_DELIMS);
                if (!op) {
                    fprintf(stderr, "Error: %s requires one operand.\n", mnemonic);
                    exit(1);
                }
                code[addr++] = (uint8_t) opcode;
                code[addr++] = encodeRegister(op, mnemonic);
                break;
            }

            case FORM_STACK_REG: {
                char *regToken = strtok(NULL, TOKEN_DELIMS);
                if (!regToken) {
                    fprintf(stderr, "Error: %s requires a register operand.\n", mnemonic);
                    exit(1);
                }
                code[addr++] = (uint8_t) opcode;
                code[addr++] = encodeRegister(regToken, mnemonic);
                break;
            }

            case FORM_REG_REG: {
                char *op1 = strtok(NULL, TOKEN_DELIMS);
                char *op2 = strtok(NULL, TOKEN_DELIMS);
                if (!op1 || !op2) {
                    fprintf(stderr, "Error: %s requires two operands.\n", mnemonic);
                    exit(1);
                }
                code[addr++] = (uint8_t) opcode;
                code[addr++] = encodeRegister(op1, mnemonic);
                code[addr++] = encodeRegister(op2, mnemonic);
                break;
            }

            case FORM_REG_IMM: {
                char *regToken = strtok(NULL, TOKEN_DELIMS);
                char *immToken = strtok(NULL, TOKEN_DELIMS);
                if (!regToken || !immToken) {
                    fprintf(stderr, "Error: %s requires two operands.\n", mnemonic);
                    exit(1);
                }
                code[addr++] = (uint8_t) opcode;
                code[addr++] = encodeRegister(regToken, mnemonic);
                code[addr++] = parseImmediate(immToken);
                break;
            }

            case FORM_TARGET: {
                char *operand = strtok(NULL, TOKEN_DELIMS);
                if (!operand) {
                    fprintf(stderr, "Error: %s requires an operand.\n", mnemonic);
                    exit(1);
                }
                code[addr++] = (uint8_t) opcode;
                code[addr++] = encodeTarget(operand);
                break;
            }

            case FORM_MOV: {
                char *dest = strtok(NULL, TOKEN_DELIMS);
                char *src = strtok(NULL, TOKEN_DELIMS);
                if (!dest || !src) {
                    fprintf(stderr, "Error: MOV requires two operands.\n");
                    exit(1);
                }
                // Operand bytes per variant:
                //   MOV_RN_IMM:  [register, immediate]
                //   MOV_RN_RM:   [dest register, src register]
                //   MOV_RN_ADDR: [dest register, address]
                //   MOV_ADDR_RN: [address, register]
                int movOpcode = resolveMOV(dest, src);
                code[addr++] = (uint8_t) movOpcode;
                if (movOpcode == MOV_ADDR_RN) {
                    code[addr++] = encodeAddress(dest);
                    code[addr++] = encodeRegister(src, mnemonic);
                } else {
                    code[addr++] = encodeRegister(dest, mnemonic);
                    if (movOpcode == MOV_RN_RM)
                        code[addr++] = encodeRegister(src, mnemonic);
                    else if (movOpcode == MOV_RN_ADDR)
                        code[addr++] = encodeAddress(src);
                    else
                        code[addr++] = parseImmediate(src);
                }
                break;
            }

            case FORM_ALU: {
                char *dest = strtok(NULL, TOKEN_DELIMS);
                char *src = strtok(NULL, TOKEN_DELIMS);
                if (!dest || !src) {
                    fprintf(stderr, "Error: %s requires two operands.\n", mnemonic);
                    exit(1);
                }
                // `opcode` is the negative placeholder from getOpcode().
                code[addr++] = (uint8_t) resolveALU(opcode, src);
                code[addr++] = encodeRegister(dest, mnemonic);
                // Same register-vs-immediate test that resolveALU() applies.
                if (src[0] == 'R' || src[0] == 'r')
                    code[addr++] = encodeRegister(src, mnemonic);
                else
                    code[addr++] = parseImmediate(src);
                break;
            }

            default:
                fprintf(stderr, "Error: Unknown instruction '%s'\n", mnemonic);
                exit(1);
        }
    }
    return addr;
}
// Assembles `input` into cpu->memory.
// Runs the first pass to collect labels, clears MEM_SIZE bytes of
// cpu->memory, then runs the second pass to write the machine code starting
// at offset 0. Prints an error and exits if the program does not fit.
void completePass(const char *input, CPU *cpu) {
    // Start from an empty symbol table so labels left over from an earlier
    // call cannot shadow the ones defined in this source.
    labelCount = 0;
    // First pass: determine label addresses and the program size.
    int programSize = firstPass(input);
    // secondPass() does no bounds checking, so reject oversized programs
    // before anything is written.
    if (programSize > MEM_SIZE) {
        fprintf(stderr, "Error: program needs %d bytes but memory holds only %d.\n",
                programSize, (int) MEM_SIZE);
        exit(1);
    }
    memset(cpu->memory, 0, MEM_SIZE);
    // Second pass: generate machine code.
    secondPass(input, cpu->memory);
}