// // Created by bruno on 1.2.2025. // #include "assembler.h" Label labels[MAX_LABELS]; int labelCount = 0; // // Helper functions for string manipulation // void trim(char *s) { // Remove leading whitespace while (isspace((unsigned char) *s)) s++; // Remove trailing whitespace char *end = s + strlen(s) - 1; while (end > s && isspace((unsigned char) *end)) { *end = '\0'; end--; } } // Look up a label by name; returns -1 if not found. int lookupLabel(const char *name) { for (int i = 0; i < labelCount; i++) { if (strcmp(labels[i].name, name) == 0) return labels[i].address; } return -1; } // Add a label to the table void addLabel(const char *name, int address) { if (labelCount >= MAX_LABELS) { fprintf(stderr, "Too many labels!\n"); exit(1); } strncpy(labels[labelCount].name, name, sizeof(labels[labelCount].name)); labels[labelCount].address = address; labelCount++; } // // Parse a register string (e.g., "R0", "R1", etc.) and return it's number. // Returns -1 on error. int parseRegister(const char *token) { if (token[0] == 'R' || token[0] == 'r') { int reg = atoi(token + 1); if (reg >= 0 && reg < REG_COUNT) return reg; } return -1; } // Parse an immediate value (supports decimal and 0x... hexadecimal) uint8_t parseImmediate(const char *token) { int value; if (strlen(token) > 2 && token[0] == '0' && (token[1] == 'x' || token[1] == 'X')) sscanf(token, "%x", &value); else sscanf(token, "%d", &value); return (uint8_t) value; } void toUpperCase(char *string) { while (*string) { if (*string > 0x60 && *string < 0x7b) { (*string) -= 0x20; } } } // // Map an instruction mnemonic (string) to its opcode value and expected operand types. // For simplicity, we will return the opcode value and then in our parser we’ll decide how many operands to expect. // (In a full assembler you might use a more sophisticated data structure.) // int getOpcode(char *mnemonic) { toUpperCase(mnemonic); if (strcmp(mnemonic, "BRK") == 0) return BRK; else if (strcmp(mnemonic, "NOP") == 0) return NOP; else if (strcmp(mnemonic, "MOV") == 0) return -2; // Special case: we must decide between MOV_RN_IMM, MOV_RN_RM, MOV_RN_ADDR, MOV_ADDR_RN else if (strcmp(mnemonic, "SWAP") == 0) return SWAP; else if (strcmp(mnemonic, "SWAPN") == 0) return SWAPN; else if (strcmp(mnemonic, "ADD") == 0) return -3; // Special: decide between ADD_RN_RM and ADD_RN_IMM else if (strcmp(mnemonic, "SUB") == 0) return -4; // Special: decide between SUB_RN_RM and SUB_RN_IMM else if (strcmp(mnemonic, "MUL") == 0) return -5; // Special: decide between MUL_RN_RM and MUL_RN_IMM else if (strcmp(mnemonic, "DIV") == 0) return -6; // Special: decide between DIV_RN_RM and DIV_RN_IMM else if (strcmp(mnemonic, "MOD") == 0) return -7; // Special: decide between MOD_RN_RM and MOD_RN_IMM else if (strcmp(mnemonic, "NEG") == 0) return NEG_RN; else if (strcmp(mnemonic, "AND") == 0) return -8; // Special: decide between AND_RN_RM and AND_RN_IMM else if (strcmp(mnemonic, "OR") == 0) return -9; // Special: decide between OR_RN_RM and OR_RN_IMM else if (strcmp(mnemonic, "XOR") == 0) return -10; // Special: decide between XOR_RN_RM and XOR_RN_IMM else if (strcmp(mnemonic, "NOT") == 0) return NOT_RN; else if (strcmp(mnemonic, "SHL") == 0) return SHL_RN_IMM; else if (strcmp(mnemonic, "SHR") == 0) return SHR_RN_IMM; else if (strcmp(mnemonic, "SAR") == 0) return SAR_RN_IMM; else if (strcmp(mnemonic, "JMP") == 0) return JMP; else if (strcmp(mnemonic, "CMP") == 0) return CMP; else if (strcmp(mnemonic, "JE") == 0) return JE; else if (strcmp(mnemonic, "JNE") == 0) return JNE; else if (strcmp(mnemonic, "JG") == 0) return JG; else if (strcmp(mnemonic, "JL") == 0) return JL; else if (strcmp(mnemonic, "JGE") == 0) return JGE; else if (strcmp(mnemonic, "JLE") == 0) return JLE; else if (strcmp(mnemonic, "CALL") == 0) return CALL; else if (strcmp(mnemonic, "RET") == 0) return RET; else if (strcmp(mnemonic, "PUSH") == 0) return PUSH; else if (strcmp(mnemonic, "POP") == 0) return POP; else if (strcmp(mnemonic, "PUSHF") == 0) return PUSHF; else if (strcmp(mnemonic, "POPF") == 0) return POPF; else { return -1; } } // // In this simple assembler, some instructions share a mnemonic, and we must choose the correct opcode // based on the type of the operand (register vs. immediate vs. memory). // The following helper functions decide that, given two operands (as strings). // // For example, "MOV Rn, 42" should choose MOV_RN_IMM, while "MOV Rn, Rm" should choose MOV_RN_RM. // We assume that memory addresses are written in square brackets, e.g. "[123]". // int resolveMOV(const char *dest, const char *src) { // If dest starts with '[' then it is a memory destination. if (dest[0] == '[') return MOV_ADDR_RN; // actually, MOV [Addr], Rn expects Rn in second operand // Otherwise, dest is a register. // Now, check src: if (src[0] == 'R' || src[0] == 'r') { return MOV_RN_RM; } else if (src[0] == '[') { return MOV_RN_ADDR; } else { return MOV_RN_IMM; } } int resolveALU(int baseOpcode, const char *src) { // baseOpcode is one of our special negative values for ADD, SUB, etc. if (src[0] == 'R' || src[0] == 'r') switch (baseOpcode) { case -3: return ADD_RN_RM; case -4: return SUB_RN_RM; case -5: return MUL_RN_RM; case -6: return DIV_RN_RM; case -7: return MOD_RN_RM; case -8: return AND_RN_RM; case -9: return OR_RN_RM; case -10: return XOR_RN_RM; default: return -1; } else switch (baseOpcode) { case -3: return ADD_RN_IMM; case -4: return SUB_RN_IMM; case -5: return MUL_RN_IMM; case -6: return DIV_RN_IMM; case -7: return MOD_RN_IMM; case -8: return AND_RN_IMM; case -9: return OR_RN_IMM; case -10: return XOR_RN_IMM; default: return -1; } } // Reads a single line from the source string. const char *readLine(const char *source, char *buffer, size_t maxLen) { size_t i = 0; while (*source && *source != '\n' && i < maxLen - 1) { buffer[i++] = *source++; } buffer[i] = '\0'; return (*source == '\n') ? source + 1 : source; } // // The first pass scans the assembly source file to record all labels and their addresses. // The address is simply the offset into the output machine code buffer. // For this example, every instruction is assumed to have a fixed length (opcode plus operand bytes). // int firstPass(const char *source) { char line[MAX_LINE_LENGTH]; int addr = 0; const char *ptr = source; while (*ptr) { // Read a line from the source string ptr = readLine(ptr, line, sizeof(line)); trim(line); if (line[0] == '\0' || line[0] == ';' || line[0] == '#') continue; // Skip empty or comment lines char *colon = strchr(line, ':'); if (colon != NULL) { *colon = '\0'; trim(line); addLabel(line, addr); char *rest = colon + 1; trim(rest); if (strlen(rest) == 0) continue; strcpy(line, rest); } // For simplicity, we assume each instruction (with its operands) takes a fixed number of bytes. // Here we calculate the number of bytes by looking at the opcode mnemonic. // (A more robust approach would have a table for instruction sizes.) char mnemonic[32]; sscanf(line, "%31s", mnemonic); int opcode = getOpcode(mnemonic); if (opcode == -2) { // MOV: two operands separated by comma // e.g. MOV R1, 42 // We add 3 bytes: opcode, operand1, operand2. addr += 3; } else if (opcode == -3 || opcode == -4 || opcode == -5 || opcode == -6 || opcode == -7 || opcode == -8 || opcode == -9 || opcode == -10) { // ALU instructions with two operands: 3 bytes. addr += 3; } else if (opcode == NEG_RN || opcode == SWAPN || opcode == NOT_RN) { // One operand: 2 bytes. addr += 2; } else if (opcode == SWAP || opcode == CMP) { // Two operands: 3 bytes. addr += 3; } else if (opcode == SHL_RN_IMM || opcode == SHR_RN_IMM || opcode == SAR_RN_IMM) { addr += 3; } else if (opcode == JMP || opcode == JE || opcode == JNE || opcode == JG || opcode == JL || opcode == JGE || opcode == JLE || opcode == CALL) { // Jump or call: 2 bytes (opcode and one byte address/immediate). addr += 2; } else if (opcode == RET || opcode == PUSHF || opcode == POPF) { addr += 1; } else if (opcode == PUSH || opcode == POP) { addr += 2; } else { // For other instructions, we assume 3 bytes. addr += 3; } } return addr; } // // The second pass actually translates the assembly instructions to machine code. // The machine code is written into the provided buffer. (It must be large enough.) // int secondPass(const char *source, uint8_t *code) { char line[MAX_LINE_LENGTH]; int addr = 0; const char *ptr = source; while (*ptr) { ptr = readLine(ptr, line, sizeof(line)); trim(line); if (line[0] == '\0' || line[0] == ';' || line[0] == '#') continue; char *colon = strchr(line, ':'); if (colon != NULL) { *colon = ' '; } if (strlen(line) == 0) continue; char *token = strtok(line, " ,"); if (!token) continue; char mnemonic[32]; strncpy(mnemonic, token, sizeof(mnemonic)); int opcode = getOpcode(mnemonic); code[addr++] = opcode; // Handle instructions that need operand disambiguation. if (strcmp(mnemonic, "MOV") == 0) { // Get first operand. char *dest = strtok(NULL, " ,"); char *src = strtok(NULL, " ,"); if (!dest || !src) { fprintf(stderr, "Error: MOV requires two operands.\n"); exit(1); } int opcode2 = resolveMOV(dest, src); code[addr++] = opcode2; // For the MOV instructions we decide that: // - For MOV_RN_IMM: operand bytes: [register, immediate] // - For MOV_RN_RM: operand bytes: [dest register, src register] // - For MOV_RN_ADDR: operand bytes: [dest register, address] // - For MOV_ADDR_RN: operand bytes: [address, register] if (opcode2 == MOV_RN_IMM) { int reg = parseRegister(dest); uint8_t imm = parseImmediate(src); code[addr++] = reg; code[addr++] = imm; } else if (opcode2 == MOV_RN_RM) { int regDest = parseRegister(dest); int regSrc = parseRegister(src); code[addr++] = regDest; code[addr++] = regSrc; } else if (opcode2 == MOV_RN_ADDR) { // src is memory reference like "[123]" int regDest = parseRegister(dest); // Remove the brackets. char addrStr[32]; strncpy(addrStr, src + 1, strlen(src) - 2); addrStr[strlen(src) - 2] = '\0'; uint8_t memAddr = parseImmediate(addrStr); code[addr++] = regDest; code[addr++] = memAddr; } else if (opcode2 == MOV_ADDR_RN) { // dest is a memory reference, src is a register. // Remove brackets from dest. char addrStr[32]; strncpy(addrStr, dest + 1, strlen(dest) - 2); addrStr[strlen(dest) - 2] = '\0'; uint8_t memAddr = parseImmediate(addrStr); int regSrc = parseRegister(src); code[addr++] = memAddr; code[addr++] = regSrc; } } else if (strcmp(mnemonic, "ADD") == 0 || strcmp(mnemonic, "SUB") == 0 || strcmp(mnemonic, "MUL") == 0 || strcmp(mnemonic, "DIV") == 0 || strcmp(mnemonic, "MOD") == 0 || strcmp(mnemonic, "AND") == 0 || strcmp(mnemonic, "OR") == 0 || strcmp(mnemonic, "XOR") == 0) { // ALU instructions with two operands. char *dest = strtok(NULL, " ,"); char *src = strtok(NULL, " ,"); if (!dest || !src) { fprintf(stderr, "Error: %s requires two operands.\n", mnemonic); exit(1); } int baseOpcode; if (strcmp(mnemonic, "ADD") == 0) baseOpcode = -3; else if (strcmp(mnemonic, "SUB") == 0) baseOpcode = -4; else if (strcmp(mnemonic, "MUL") == 0) baseOpcode = -5; else if (strcmp(mnemonic, "DIV") == 0) baseOpcode = -6; else if (strcmp(mnemonic, "MOD") == 0) baseOpcode = -7; else if (strcmp(mnemonic, "AND") == 0) baseOpcode = -8; else if (strcmp(mnemonic, "OR") == 0) baseOpcode = -9; else if (strcmp(mnemonic, "XOR") == 0) baseOpcode = -10; else baseOpcode = -1; int opcode3 = resolveALU(baseOpcode, src); code[addr++] = opcode3; int regDest = parseRegister(dest); code[addr++] = regDest; // For a register source, encode the register; for an immediate, encode the value. if (src[0] == 'R' || src[0] == 'r') { int regSrc = parseRegister(src); code[addr++] = regSrc; } else { uint8_t imm = parseImmediate(src); code[addr++] = imm; } } else if (strcmp(mnemonic, "NEG") == 0 || strcmp(mnemonic, "SWAPN") == 0 || strcmp(mnemonic, "NOT") == 0) { // One operand instructions. char *op = strtok(NULL, " ,"); if (!op) { fprintf(stderr, "Error: %s requires one operand.\n", mnemonic); exit(1); } int opcode4 = getOpcode(mnemonic); code[addr++] = opcode4; int reg = parseRegister(op); code[addr++] = reg; } else if (strcmp(mnemonic, "SWAP") == 0 || strcmp(mnemonic, "CMP") == 0) { // Two operand instructions: both registers. char *op1 = strtok(NULL, " ,"); char *op2 = strtok(NULL, " ,"); if (!op1 || !op2) { fprintf(stderr, "Error: %s requires two operands.\n", mnemonic); exit(1); } int opcode5 = getOpcode(mnemonic); code[addr++] = opcode5; int r1 = parseRegister(op1); int r2 = parseRegister(op2); code[addr++] = r1; code[addr++] = r2; } else if (strcmp(mnemonic, "SHL") == 0 || strcmp(mnemonic, "SHR") == 0 || strcmp(mnemonic, "SAR") == 0 || strcmp(mnemonic, "SHRS") == 0) { // Shift instructions: one register operand and one immediate. char *regToken = strtok(NULL, " ,"); char *immToken = strtok(NULL, " ,"); if (!regToken || !immToken) { fprintf(stderr, "Error: %s requires two operands.\n", mnemonic); exit(1); } int opcode6 = getOpcode(mnemonic); code[addr++] = opcode6; int reg = parseRegister(regToken); code[addr++] = reg; uint8_t imm = parseImmediate(immToken); code[addr++] = imm; } else if (strcmp(mnemonic, "JMP") == 0 || strcmp(mnemonic, "JE") == 0 || strcmp(mnemonic, "JNE") == 0 || strcmp(mnemonic, "JG") == 0 || strcmp(mnemonic, "JL") == 0 || strcmp(mnemonic, "JGE") == 0 || strcmp(mnemonic, "JLE") == 0 || strcmp(mnemonic, "CALL") == 0) { // Jump instructions: one operand which may be a label or an immediate address. char *operand = strtok(NULL, " ,"); if (!operand) { fprintf(stderr, "Error: %s requires an operand.\n", mnemonic); exit(1); } int opcode7 = getOpcode(mnemonic); code[addr++] = opcode7; // If the operand is not a number, assume it is a label. if (!isdigit(operand[0])) { int labelAddr = lookupLabel(operand); if (labelAddr < 0) { fprintf(stderr, "Error: undefined label '%s'\n", operand); exit(1); } code[addr++] = (uint8_t) labelAddr; } else { uint8_t imm = parseImmediate(operand); code[addr++] = imm; } } else if (strcmp(mnemonic, "RET") == 0 || strcmp(mnemonic, "PUSHF") == 0 || strcmp(mnemonic, "POPF") == 0) { // Instructions with no operand. int opcode8 = getOpcode(mnemonic); code[addr++] = opcode8; } else if (strcmp(mnemonic, "PUSH") == 0 || strcmp(mnemonic, "POP") == 0) { // One operand (a register) char *regToken = strtok(NULL, " ,"); if (!regToken) { fprintf(stderr, "Error: %s requires a register operand.\n", mnemonic); exit(1); } int opcode9 = getOpcode(mnemonic); code[addr++] = opcode9; int reg = parseRegister(regToken); code[addr++] = reg; } else { fprintf(stderr, "Error: Unknown instruction '%s'\n", mnemonic); exit(1); } } return addr; } void completePass(const char *input, CPU *cpu) { // First pass: determine label addresses. firstPass(input); memset(cpu->memory, 0, MEM_SIZE); // Second pass: generate machine code. secondPass(input, cpu->memory); }