Files
RISC-B/assembler/assembler.c
2025-02-22 16:15:05 +01:00

660 lines
24 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//
// Created by bruno on 1.2.2025.
//
#include "assembler.h"
// Label table shared by both assembler passes. addLabel() appends entries
// (refusing once MAX_LABELS is reached) and lookupLabel() searches them.
Label labels[MAX_LABELS];
// Number of entries of `labels` currently in use; firstPass() resets it to 0.
int labelCount = 0;
//
// Helper functions for string manipulation
//
// Strip leading and trailing whitespace from `s` in place.
//
// The kept text is moved to the start of the buffer so that the caller, who
// only holds `s`, actually sees the leading whitespace disappear.
// `s` must be a writable, NUL-terminated string.
void trim(char *s) {
    // Find the first character that is not whitespace.
    char *start = s;
    while (isspace((unsigned char) *start)) {
        start++;
    }

    // Shrink the length until the last kept character is not whitespace.
    // Working with a length (instead of an `end` pointer) avoids forming a
    // pointer before the buffer when the string is empty.
    size_t len = strlen(start);
    while (len > 0 && isspace((unsigned char) start[len - 1])) {
        len--;
    }

    // The regions may overlap, so memmove (not memcpy/strcpy) is required.
    if (start != s) {
        memmove(s, start, len);
    }
    s[len] = '\0';
}
// Search the label table for an entry whose name equals `name`.
// Returns the address stored with the first match, or -1 if there is none.
int lookupLabel(const char *name) {
    int index = 0;
    while (index < labelCount) {
        if (!strcmp(labels[index].name, name)) {
            return labels[index].address;
        }
        index++;
    }
    return -1;
}
// Append a label to the table.
//
// name:    label text; it is truncated if it does not fit the entry's name
//          buffer, and the stored copy is always NUL-terminated.
// address: value recorded for the label.
//
// Returns 0 on success, or 1 (after printing a message to stderr) when the
// table already holds MAX_LABELS entries.
int addLabel(const char *name, uint32_t address) {
    if (labelCount >= MAX_LABELS) {
        fprintf(stderr, "Too many labels!\n");
        return 1;
    }

    // strncpy does not terminate the destination when the source fills it,
    // so copy one byte less than the capacity and terminate explicitly.
    char *slot = labels[labelCount].name;
    const size_t capacity = sizeof(labels[labelCount].name);
    strncpy(slot, name, capacity - 1);
    slot[capacity - 1] = '\0';

    labels[labelCount].address = address;
    labelCount++;
    return 0;
}
//
// Parse a register token (e.g. "R0", "r7") and return its number.
//
// The token must start with 'R' or 'r' immediately followed by a decimal
// digit; characters after the number (such as a trailing comma) are ignored.
// Returns -1 if the token is not a register or the number is not in the
// range [0, REG_COUNT).
int parseRegister(const char *token) {
    if (token[0] != 'R' && token[0] != 'r') {
        return -1;
    }
    // Without this check a bare "R" or "Rfoo" would be accepted as register 0.
    if (!isdigit((unsigned char) token[1])) {
        return -1;
    }
    // strtol saturates on overflow, so an absurdly long number simply fails
    // the range check below instead of invoking undefined behaviour.
    const long reg = strtol(token + 1, NULL, 10);
    if (reg >= 0 && reg < REG_COUNT) {
        return (int) reg;
    }
    return -1;
}
// Parse an immediate operand and return its low 8 bits.
//
// Accepted forms: an optional '+' or '-' sign, followed by either decimal
// digits or a "0x"/"0X" prefix and hexadecimal digits. Characters after the
// number (such as a trailing comma) are ignored. A leading zero does NOT
// select octal ("010" is ten). Negative values are returned in two's
// complement ("-1" -> 0xFF). A token with no digits yields 0.
uint8_t parseImmediate(const char *token) {
    const char *digits = token;

    // Optional sign comes first, before any "0x" prefix.
    int negative = 0;
    if (*digits == '+' || *digits == '-') {
        negative = (*digits == '-');
        digits++;
    }

    int base = 10;
    if (digits[0] == '0' && (digits[1] == 'x' || digits[1] == 'X')) {
        base = 16;
        digits += 2;
    }

    // strtoul would itself accept whitespace or another sign here; reject
    // anything that does not start with a digit of the selected base.
    const unsigned char first = (unsigned char) *digits;
    if (!(base == 16 ? isxdigit(first) : isdigit(first))) {
        return 0;
    }

    unsigned long value = strtoul(digits, NULL, base);
    if (negative) {
        // Unsigned negation wraps, which yields the two's-complement bits.
        value = 0UL - value;
    }
    return (uint8_t) (value & 0xFFu);
}
// Convert the ASCII letters 'a'..'z' in `string` to upper case, in place.
// Carriage returns ('\r') are replaced by a space along the way.
void toUpperCase(char *string) {
    for (char *cursor = string; *cursor != '\0'; cursor++) {
        const char ch = *cursor;
        if (ch >= 'a' && ch <= 'z') {
            *cursor = (char) (ch - ('a' - 'A'));
        } else if (ch == '\r') {
            *cursor = ' ';
        }
    }
}
// Convert the ASCII letters 'A'..'Z' in `string` to lower case, in place.
// All other characters are left untouched.
void toLowerCase(char *string) {
    for (size_t i = 0; string[i] != '\0'; i++) {
        const char ch = string[i];
        if ('A' <= ch && ch <= 'Z') {
            string[i] = (char) (ch + ('a' - 'A'));
        }
    }
}
//
// Map an instruction mnemonic to an opcode.
//
// The mnemonic is upper-cased IN PLACE before the lookup. The result is:
//   * the opcode constant, for mnemonics that have exactly one encoding;
//   * a negative marker in the range -2..-17, for mnemonics whose encoding
//     depends on the operand types (we'll pick the real opcode later, once
//     the operands are known);
//   * -1, for an unknown mnemonic.
//
int getOpcode(char *mnemonic) {
    static const struct {
        const char *text;
        int code;
    } table[] = {
        {"NOP", NOP},
        {"BRK", BRK},
        {"HLT", HLT},
        {"MOV", -2},   // MOV_IMM_RN, MOV_RN_RM, MOV_RN_ADDR or MOV_ADDR_RN
        {"SWAP", SWAP},
        {"SWAPN", SWAPN},
        {"ADD", -3},   // ADD_RN_RM or ADD_RN_IMM
        {"SUB", -4},   // SUB_RN_RM or SUB_RN_IMM
        {"MUL", -5},   // MUL_RN_RM or MUL_RN_IMM
        {"DIV", -6},   // DIV_RN_RM or DIV_RN_IMM
        {"MOD", -7},   // MOD_RN_RM or MOD_RN_IMM
        {"NEG", NEG_RN},
        {"AND", -8},   // AND_RN_RM or AND_RN_IMM
        {"OR", -9},    // OR_RN_RM or OR_RN_IMM
        {"XOR", -10},  // XOR_RN_RM or XOR_RN_IMM
        {"NOT", NOT_RN},
        {"SHL", SHL_RN_IMM},
        {"SHR", SHR_RN_IMM},
        {"SAR", SAR_RN_IMM},
        {"JMP", -11},  // JMP_REL when the operand has a sign, otherwise JMP
        {"INC", -12},  // INC_RN or INC_ADDR
        {"DEC", -13},  // DEC_RN or DEC_ADDR
        {"CMP", CMP},
        {"JE", JE},
        {"JNE", JNE},
        {"JMPBS", -14},  // BIT_TS_RN or BIT_TS_ADDR
        {"JMPBC", -15},  // BIT_TC_RN or BIT_TC_ADDR
        {"BITS", -16},   // BITS_RN or BITS_ADDR
        {"BITC", -17},   // BITC_RN or BITC_ADDR
        {"JG", JG},
        {"JL", JL},
        {"JGE", JGE},
        {"JLE", JLE},
        {"CALL", CALL},
        {"RET", RET},
    };

    toUpperCase(mnemonic);

    const size_t entries = sizeof(table) / sizeof(table[0]);
    for (size_t i = 0; i < entries; i++) {
        if (strcmp(mnemonic, table[i].text) == 0) {
            return table[i].code;
        }
    }
    return -1;
}
//
// Some mnemonics map to several opcodes; the right one depends on whether
// each operand is a register ("R3"), a memory reference in square brackets
// ("[123]") or an immediate value. The resolve* helpers make that choice by
// looking at the first character of each operand string.
//
// resolveMOV picks the MOV variant:
//   dest starts with '['            -> MOV_RN_ADDR
//   dest and src are both registers -> MOV_RN_RM
//   src starts with '['             -> MOV_ADDR_RN
//   anything else                   -> MOV_IMM_RN
//
// Note that completePass() passes the SECOND source operand as `dest` and
// the first one as `src`.
//
int resolveMOV(const char *dest, const char *src) {
    // A bracketed destination always means "store to memory".
    if (dest[0] == '[') {
        return MOV_RN_ADDR;
    }

    const int destIsRegister = (dest[0] == 'R' || dest[0] == 'r');
    const int srcIsRegister = (src[0] == 'R' || src[0] == 'r');
    if (destIsRegister && srcIsRegister) {
        return MOV_RN_RM;
    }

    return (src[0] == '[') ? MOV_ADDR_RN : MOV_IMM_RN;
}
// Pick the concrete opcode for a mnemonic that getOpcode() reported with a
// negative marker (-3..-17, MOV excluded).
//
// baseOpcode: the marker returned by getOpcode().
// src:        first operand as written in the source line.
// dest:       second operand as written in the source line (may be empty).
//             (The parameter names follow the call sites in completePass(),
//             which pass operand1 as `src` and operand2 as `dest`.)
//
// Decision order:
//   1. second operand is a register           -> register/register ALU form
//   2. first operand is a register and the
//      mnemonic is INC/DEC/JMPBS/JMPBC/BITS/
//      BITC (marker below -11)                -> single-register form
//   3. first operand starts with '+' or '-'   -> JMP_REL (JMP only)
//   4. otherwise                              -> immediate / address form
//
// Returns the opcode, or -1 if the marker is not valid for the chosen form.
int resolveALU(int baseOpcode, const char *src, const char *dest) {
    const int destIsRegister = (dest[0] == 'R' || dest[0] == 'r');
    // The parentheses matter: `a || b && c` parses as `a || (b && c)`, which
    // used to send every lower-case 'r' operand into the single-register
    // branch regardless of the mnemonic (so "add r1 5" or "jmp rloop"
    // resolved to -1).
    const int srcIsRegister = (src[0] == 'r' || src[0] == 'R');

    if (destIsRegister) {
        switch (baseOpcode) {
            case -3:
                return ADD_RN_RM;
            case -4:
                return SUB_RN_RM;
            case -5:
                return MUL_RN_RM;
            case -6:
                return DIV_RN_RM;
            case -7:
                return MOD_RN_RM;
            case -8:
                return AND_RN_RM;
            case -9:
                return OR_RN_RM;
            case -10:
                return XOR_RN_RM;
            default:
                return -1;
        }
    }

    if (srcIsRegister && baseOpcode < -11) {
        switch (baseOpcode) {
            case -12:
                return INC_RN;
            case -13:
                return DEC_RN;
            case -14:
                return BIT_TS_RN;
            case -15:
                return BIT_TC_RN;
            case -16:
                return BITS_RN;
            case -17:
                return BITC_RN;
            default:
                return -1;
        }
    }

    if (src[0] == '+' || src[0] == '-') {
        // Only JMP has a signed (relative) form.
        return (baseOpcode == -11) ? JMP_REL : -1;
    }

    switch (baseOpcode) {
        case -3:
            return ADD_RN_IMM;
        case -4:
            return SUB_RN_IMM;
        case -5:
            return MUL_RN_IMM;
        case -6:
            return DIV_RN_IMM;
        case -7:
            return MOD_RN_IMM;
        case -8:
            return AND_RN_IMM;
        case -9:
            return OR_RN_IMM;
        case -10:
            return XOR_RN_IMM;
        case -11:
            return JMP;
        case -12:
            return INC_ADDR;
        case -13:
            return DEC_ADDR;
        case -14:
            return BIT_TS_ADDR;
        case -15:
            return BIT_TC_ADDR;
        case -16:
            return BITS_ADDR;
        case -17:
            return BITC_ADDR;
        default:
            return -1;
    }
}
// Copy one line of `source` into `buffer` and return a pointer to the start
// of the next line.
//
// source: NUL-terminated text; a line ends at '\n' or at the terminator.
// buffer: receives the line without the '\n', always NUL-terminated when
//         maxLen > 0.
// maxLen: capacity of `buffer` in bytes. Lines longer than maxLen - 1
//         characters are truncated; the excess is discarded rather than
//         being handed back as a separate "line".
//
// The whole line is always consumed, so repeated calls make progress on any
// non-empty input regardless of maxLen.
const char *readLine(const char *source, char *buffer, size_t maxLen) {
    size_t used = 0;
    while (*source && *source != '\n') {
        // `used + 1 < maxLen` keeps room for the terminator and, unlike
        // `used < maxLen - 1`, cannot underflow when maxLen is 0.
        if (used + 1 < maxLen) {
            buffer[used++] = *source;
        }
        source++;
    }
    if (maxLen > 0) {
        buffer[used] = '\0';
    }
    return (*source == '\n') ? source + 1 : source;
}
//
// First pass: scan the assembly text and record every label together with
// the address of the instruction that follows it.
//
// The label table is cleared first. The address counter starts at 0 and grows
// by CPU_INSTRUCTION_SIZE for every line whose mnemonic getOpcode()
// recognises; unknown mnemonics are reported on stdout and do not advance it.
// Blank lines and lines starting with ';' or '#' are ignored, and anything
// after a ';' is treated as a comment.
//
void firstPass(const char *source) {
    char line[MAX_LINE_LENGTH];
    int addr = 0;
    labelCount = 0;
    const char *ptr = source;
    while (*ptr) {
        ptr = readLine(ptr, line, sizeof(line));
        trim(line);

        // Step over indentation explicitly so that the checks below and the
        // stored label names never see leading whitespace.
        char *text = line;
        while (isspace((unsigned char) *text)) {
            text++;
        }

        // Skip blank lines or comments.
        if (text[0] == '\0' || text[0] == ';' || text[0] == '#')
            continue;

        // Cut off a trailing comment before looking for a label. Otherwise a
        // ':' inside the comment would be taken for a label definition and
        // the addresses would drift away from what completePass() emits.
        char *semicolon = strchr(text, ';');
        if (semicolon != NULL) {
            *semicolon = '\0';
            trim(text);
            if (text[0] == '\0')
                continue;
        }

        // Process labels.
        char *colon = strchr(text, ':');
        if (colon != NULL) {
            *colon = '\0';
            trim(text);
            // A full table is already reported by addLabel() itself.
            (void) addLabel(text, addr);

            // Continue with whatever follows the label on the same line.
            // Advancing the pointer avoids copying between overlapping
            // regions of `line`.
            text = colon + 1;
            while (isspace((unsigned char) *text)) {
                text++;
            }
            if (*text == '\0')
                continue;
        }

        // Only the mnemonic is needed to decide whether the line occupies an
        // instruction slot.
        char mnemonic[32] = "";
        sscanf(text, "%31s", mnemonic);

        int baseOpcode = getOpcode(mnemonic);
        if (baseOpcode == -1) {
            printf("Unknown instruction: %s\n", mnemonic);
            continue;
        }
        addr += CPU_INSTRUCTION_SIZE;
    }
}
//
// Second pass: translate the assembly text into machine code.
//
// Helpers used only by completePass() come first.
//

// Parse a memory operand written as "[address]" and return the address.
// The number may be decimal, hexadecimal ("0x...") or octal (leading 0), as
// understood by strtoul() with base 0; parsing stops at the closing bracket.
static uint32_t parseBracketedAddress(const char *operand) {
    if (*operand == '[') {
        operand++;
    }
    return (uint32_t) strtoul(operand, NULL, 0);
}

// Write the low 24 bits of `value` to cpu->memory at *addr, least significant
// byte first, and advance *addr by three.
static void emitAddress24(CPU *cpu, uint32_t *addr, uint32_t value) {
    cpu->memory[(*addr)++] = value & 0xFF;
    cpu->memory[(*addr)++] = (value >> 8) & 0xFF;
    cpu->memory[(*addr)++] = (value >> 16) & 0xFF;
}

// Turn a jump/call operand into an address. An operand starting with a digit
// is parsed as a number; anything else is looked up as a label.
// Returns 0 on success, or 1 (after printing to stderr) for an unknown label.
static int resolveJumpTarget(const char *operand, uint32_t *target) {
    // If the operand isn't numeric, treat it as a label.
    if (isdigit((unsigned char) operand[0])) {
        *target = (uint32_t) strtoul(operand, NULL, 0);
        return 0;
    }
    const int labelAddr = lookupLabel(operand);
    if (labelAddr < 0) {
        fprintf(stderr, "Error: undefined label '%s'\n", operand);
        return 1;
    }
    *target = (uint32_t) labelAddr;
    return 0;
}

//
// Assemble `source` into cpu->memory.
//
// source: NUL-terminated assembly text, one instruction per line. Operands
//         are separated by spaces. Blank lines, lines starting with ';' or
//         '#', and anything after a ';' are ignored. A line may start with
//         "label:" optionally followed by an instruction.
// cpu:    receives the machine code (cpu->memory), the instruction-slot to
//         source-line table (cpu->addrToLineMapper) and cpu->programEnd.
// erase:  when true, memory, registers, stack, line table, pc, stack pointer,
//         flags and cycle counter are cleared first.
//
// Every instruction occupies one slot of CPU_INSTRUCTION_SIZE bytes starting
// at a multiple of that size; unused bytes of a slot are left as they were.
//
// Returns the address just past the last slot, or 1 after printing a message
// to stderr when a line cannot be assembled (cpu->memory may then hold a
// partially assembled program and cpu->programEnd is not updated).
//
// NOTE(review): nothing here checks `addr` against the size of cpu->memory.
//
uint32_t completePass(const char *source, CPU *cpu, bool erase) {
    if (erase) {
        memset(cpu->memory, 0, sizeof(cpu->memory));
        memset(cpu->regs, 0, sizeof(cpu->regs));
        memset(cpu->stack, 0, sizeof(cpu->stack));
        memset(cpu->addrToLineMapper, 0, sizeof(cpu->addrToLineMapper));
        cpu->pc = 0;
        cpu->stack_ptr = 0;
        cpu->flags = 0;
        cpu->cycle = 0;
    }

    // First pass: determine label addresses.
    firstPass(source);

    char line[MAX_LINE_LENGTH];
    uint32_t addr = 0;
    const char *ptr = source;
    uint32_t lineIndex = 0; // zero-based index of the source line being read

    while (*ptr) {
        ptr = readLine(ptr, line, sizeof(line));
        trim(line);

        // Step over indentation explicitly so that an indented comment or
        // blank line is recognised as such instead of aborting assembly.
        char *text = line;
        while (isspace((unsigned char) *text)) {
            text++;
        }
        if (text[0] == '\0' || text[0] == ';' || text[0] == '#') {
            lineIndex++;
            continue;
        }

        // Stop at the first semicolon: the rest of the line is a comment.
        char *semicolon = strchr(text, ';');
        if (semicolon != NULL) {
            *semicolon = '\0';
            trim(text);
            if (text[0] == '\0') {
                lineIndex++;
                continue;
            }
        }

        // Step over a label definition. firstPass() reserves a slot for an
        // instruction that follows a label on the same line, so it has to be
        // assembled here as well or every later label would be off.
        char *colon = strchr(text, ':');
        if (colon != NULL) {
            text = colon + 1;
            while (isspace((unsigned char) *text)) {
                text++;
            }
            if (*text == '\0') {
                lineIndex++;
                continue;
            }
        }

        // Parse mnemonic and up to three space-separated operands.
        char mnemonic[32], operand1[64], operand2[64], operand3[64];
        mnemonic[0] = operand1[0] = operand2[0] = operand3[0] = '\0';
        sscanf(text, "%31s %63[^ ] %63[^ ] %63s",
               mnemonic, operand1, operand2, operand3);

        const uint32_t oldAddr = addr; // start of this instruction's slot

        // Map the mnemonic to a base opcode.
        const int baseOpcode = getOpcode(mnemonic);
        if (baseOpcode == -1) {
            fprintf(stderr, "Unknown instruction: %s\n", mnemonic);
            return 1;
        }

        // --- MOV (baseOpcode == -2): "MOV <source> <destination>" ---
        if (baseOpcode == -2) {
            if (operand1[0] == '\0' || operand2[0] == '\0') {
                fprintf(stderr, "Error: MOV requires two operands.\n");
                return 1;
            }
            const int resolvedOpcode = resolveMOV(operand2, operand1);
            cpu->memory[addr++] = resolvedOpcode;
            if (resolvedOpcode == MOV_IMM_RN) {
                // immediate, register
                cpu->memory[addr++] = parseImmediate(operand1);
                cpu->memory[addr++] = parseRegister(operand2);
            } else if (resolvedOpcode == MOV_RN_RM) {
                // source register, destination register
                cpu->memory[addr++] = parseRegister(operand1);
                cpu->memory[addr++] = parseRegister(operand2);
            } else if (resolvedOpcode == MOV_ADDR_RN) {
                // 24-bit source address, register
                emitAddress24(cpu, &addr, parseBracketedAddress(operand1));
                cpu->memory[addr++] = parseRegister(operand2);
            } else if (resolvedOpcode == MOV_RN_ADDR) {
                // register, 24-bit destination address
                cpu->memory[addr++] = parseRegister(operand1);
                emitAddress24(cpu, &addr, parseBracketedAddress(operand2));
            }
        }
        // --- INC and DEC (baseOpcode == -12 or -13): one operand ---
        else if (baseOpcode == -12 || baseOpcode == -13) {
            if (operand1[0] == '\0') {
                fprintf(stderr, "Error: %s requires one operand.\n", mnemonic);
                return 1;
            }
            const int resolvedOpcode = resolveALU(baseOpcode, operand1, operand2);
            if (resolvedOpcode < 0) {
                // Do not emit the -1 "no such form" marker as an opcode byte.
                fprintf(stderr, "Error: invalid operands for %s\n", mnemonic);
                return 1;
            }
            cpu->memory[addr++] = resolvedOpcode;
            if (operand1[0] == 'R' || operand1[0] == 'r') {
                cpu->memory[addr++] = parseRegister(operand1);
            } else {
                // Memory reference written as "[address]".
                emitAddress24(cpu, &addr, parseBracketedAddress(operand1));
            }
        }
        // --- JMP (baseOpcode == -11): relative offset or absolute target ---
        else if (baseOpcode == -11) {
            if (operand1[0] == '\0') {
                fprintf(stderr, "Error: JMP requires one operand.\n");
                return 1;
            }
            const int resolvedOpcode = resolveALU(baseOpcode, operand1, operand2);
            if (resolvedOpcode < 0) {
                fprintf(stderr, "Error: invalid operands for %s\n", mnemonic);
                return 1;
            }
            cpu->memory[addr++] = resolvedOpcode;
            if (operand1[0] == '+' || operand1[0] == '-') {
                // Relative jump: one-byte offset.
                cpu->memory[addr++] = parseImmediate(operand1);
            } else {
                // Absolute jump: label or number, stored as 24 bits. An
                // undefined label is an error rather than a jump to 0xFFFFFF.
                uint32_t jumpAddr = 0;
                if (resolveJumpTarget(operand1, &jumpAddr) != 0) {
                    return 1;
                }
                emitAddress24(cpu, &addr, jumpAddr);
            }
        }
        // --- JMPBS / JMPBC (baseOpcode == -14 or -15): three operands ---
        else if (baseOpcode == -14 || baseOpcode == -15) {
            if (operand1[0] == '\0' || operand2[0] == '\0' || operand3[0] == '\0') {
                fprintf(stderr, "Error: %s requires three operands.\n", mnemonic);
                return 1;
            }
            const int resolvedOpcode = resolveALU(baseOpcode, operand1, operand2);
            if (resolvedOpcode < 0) {
                fprintf(stderr, "Error: invalid operands for %s\n", mnemonic);
                return 1;
            }
            uint32_t jumpAddr = 0;
            if (resolveJumpTarget(operand3, &jumpAddr) != 0) {
                return 1;
            }
            cpu->memory[addr++] = resolvedOpcode;
            // Operand to test: register number or 24-bit address.
            if (operand1[0] == 'R' || operand1[0] == 'r') {
                cpu->memory[addr++] = parseRegister(operand1);
            } else {
                emitAddress24(cpu, &addr, parseBracketedAddress(operand1));
            }
            // Bit number (one byte), then the jump target (24 bits).
            cpu->memory[addr++] = parseImmediate(operand2);
            emitAddress24(cpu, &addr, jumpAddr);
        }
        // --- Remaining ambiguous mnemonics: ADD..XOR (-3..-10), BITS, BITC ---
        // Encoded as: opcode, first operand as register, then the second
        // operand as register or immediate.
        else if (baseOpcode < 0) {
            if (operand1[0] == '\0' || operand2[0] == '\0') {
                fprintf(stderr, "Error: %s requires two operands.\n", mnemonic);
                return 1;
            }
            const int resolvedOpcode = resolveALU(baseOpcode, operand1, operand2);
            if (resolvedOpcode < 0) {
                fprintf(stderr, "Error: invalid operands for %s\n", mnemonic);
                return 1;
            }
            cpu->memory[addr++] = resolvedOpcode;
            // NOTE(review): for BITS/BITC with a "[address]" first operand
            // resolveALU() selects the *_ADDR opcode, but the operand is still
            // encoded as a register here (parseRegister() yields -1). The
            // intended byte layout is not visible in this file -- confirm
            // against the CPU implementation.
            cpu->memory[addr++] = parseRegister(operand1);
            if (operand2[0] == 'R' || operand2[0] == 'r') {
                cpu->memory[addr++] = parseRegister(operand2);
            } else {
                cpu->memory[addr++] = parseImmediate(operand2);
            }
        }
        // --- Non-ambiguous instructions ---
        else {
            switch (baseOpcode) {
                case CMP:
                case SWAP: {
                    if (operand1[0] == '\0' || operand2[0] == '\0') {
                        fprintf(stderr, "Error: %s requires two operands.\n", mnemonic);
                        return 1;
                    }
                    cpu->memory[addr++] = baseOpcode;
                    cpu->memory[addr++] = parseRegister(operand1);
                    cpu->memory[addr++] = parseRegister(operand2);
                    break;
                }
                case SWAPN:
                case NEG_RN:
                case NOT_RN: {
                    if (operand1[0] == '\0') {
                        fprintf(stderr, "Error: %s requires one operand.\n", mnemonic);
                        return 1;
                    }
                    cpu->memory[addr++] = baseOpcode;
                    cpu->memory[addr++] = parseRegister(operand1);
                    break;
                }
                case SHL_RN_IMM:
                case SHR_RN_IMM:
                case SAR_RN_IMM: {
                    if (operand1[0] == '\0' || operand2[0] == '\0') {
                        fprintf(stderr, "Error: %s requires two operands.\n", mnemonic);
                        return 1;
                    }
                    cpu->memory[addr++] = baseOpcode;
                    cpu->memory[addr++] = parseRegister(operand1);
                    cpu->memory[addr++] = parseImmediate(operand2);
                    break;
                }
                case JE:
                case JNE:
                case JG:
                case JL:
                case JGE:
                case JLE:
                case CALL: {
                    if (operand1[0] == '\0') {
                        fprintf(stderr, "Error: %s requires one operand.\n", mnemonic);
                        return 1;
                    }
                    cpu->memory[addr++] = baseOpcode;
                    uint32_t target = 0;
                    if (resolveJumpTarget(operand1, &target) != 0) {
                        return 1;
                    }
                    emitAddress24(cpu, &addr, target);
                    break;
                }
                case RET:
                case BRK:
                case HLT:
                case NOP: {
                    cpu->memory[addr++] = baseOpcode;
                    break;
                }
                default: {
                    fprintf(stderr, "Error: Unhandled opcode %d\n", baseOpcode);
                    return 1;
                }
            }
        }

        // Warn when the bytes just written do not fit into one slot.
        const uint32_t usedBytes = addr - oldAddr;
        if (usedBytes > (uint32_t) CPU_INSTRUCTION_SIZE) {
            printf("HELP, INSTRUCTION SIZE SMALLER THAN INSTRUCTION\n");
        }

        // Index the line table by the slot's START address: `addr` already
        // points into the next slot when an instruction fills its slot
        // completely.
        cpu->addrToLineMapper[oldAddr / CPU_INSTRUCTION_SIZE] = lineIndex;

        // Advance to the next slot regardless of how many bytes were used.
        addr = oldAddr + CPU_INSTRUCTION_SIZE;
        lineIndex++;
    }

    cpu->programEnd = addr;
    return addr;
}