// // Created by bruno on 1.2.2025. // #include "assembler.h" Label labels[MAX_LABELS]; int labelCount = 0; // // Helper functions for string manipulation // void trim(char *s) { // Remove leading whitespace while (isspace((unsigned char) *s)) s++; // Remove trailing whitespace char *end = s + strlen(s) - 1; while (end > s && isspace((unsigned char) *end)) { *end = '\0'; end--; } } // Look up a label by name; returns -1 if not found. int lookupLabel(const char *name) { for (int i = 0; i < labelCount; i++) { if (strcmp(labels[i].name, name) == 0) return labels[i].address; } return -1; } // Add a label to the table int addLabel(const char *name, uint32_t address) { if (labelCount >= MAX_LABELS) { fprintf(stderr, "Too many labels!\n"); return 1; } strncpy(labels[labelCount].name, name, sizeof(labels[labelCount].name)); labels[labelCount].address = address; labelCount++; } // // Parse a register string (e.g., "R0", "R1", etc.) and return it's number. // Returns -1 on error. int parseRegister(const char *token) { if (token[0] == 'R' || token[0] == 'r') { int reg = atoi(token + 1); if (reg >= 0 && reg < REG_COUNT) return reg; } return -1; } // Parse an immediate value (supports decimal and 0x... hexadecimal) uint8_t parseImmediate(const char *token) { int16_t value = 0; // Temporary variable as signed int16_t // Check if the value starts with '0x' or '0X' for hexadecimal if (strlen(token) > 2 && token[0] == '0' && (token[1] == 'x' || token[1] == 'X')) { // Handle hexadecimal: Check for signed hex (e.g. -0x1F4, +0x1F4) if (token[2] == '+' || token[2] == '-') { sscanf(token, "%hx", &value); // Hexadecimal signed (same as unsigned in sscanf) if (token[2] == '-') value = -value; // Adjust sign if negative } else { // Hexadecimal unsigned value sscanf(token, "%hx", &value); } } else { // Check if the value has a signed prefix (+ or -) for decimal if (token[0] == '+' || token[0] == '-') { sscanf(token, "%hd", &value); // Signed decimal } else { // Unsigned decimal value unsigned int unsigned_value; sscanf(token, "%u", &unsigned_value); value = (int16_t) unsigned_value; // Cast unsigned to signed } } // Convert signed 16-bit value to unsigned 8-bit value // Ensure the value fits within the range of uint8_t (0 to 255) return (uint8_t) (value & 0xFF); // Mask with 0xFF to discard upper bits } void toUpperCase(char *string) { while (*string) { if (*string > 0x60 && *string < 0x7b) { (*string) -= 0x20; } if (*string == '\r') { *string = ' '; } string++; } } void toLowerCase(char *string) { while (*string) { if (*string >= 'A' && *string <= 'Z') { (*string) += 0x20; } string++; } } // // Map an instruction mnemonic (string) to its opcode value and expected operand types. // For simplicity, we will return the opcode value and then in our parser we’ll decide how many operands to expect. // (In a full assembler you might use a more sophisticated data structure.) // int getOpcode(char *mnemonic) { toUpperCase(mnemonic); if (strcmp(mnemonic, "NOP") == 0) return NOP; else if (strcmp(mnemonic, "BRK") == 0) return BRK; else if (strcmp(mnemonic, "HLT") == 0) return HLT; else if (strcmp(mnemonic, "MOV") == 0) return -2; // Special case: we must decide between MOV_IMM_RN, MOV_RN_RM, MOV_RN_ADDR, MOV_ADDR_RN else if (strcmp(mnemonic, "SWAP") == 0) return SWAP; else if (strcmp(mnemonic, "SWAPN") == 0) return SWAPN; else if (strcmp(mnemonic, "ADD") == 0) return -3; // Special: decide between ADD_RN_RM and ADD_RN_IMM else if (strcmp(mnemonic, "SUB") == 0) return -4; // Special: decide between SUB_RN_RM and SUB_RN_IMM else if (strcmp(mnemonic, "MUL") == 0) return -5; // Special: decide between MUL_RN_RM and MUL_RN_IMM else if (strcmp(mnemonic, "DIV") == 0) return -6; // Special: decide between DIV_RN_RM and DIV_RN_IMM else if (strcmp(mnemonic, "MOD") == 0) return -7; // Special: decide between MOD_RN_RM and MOD_RN_IMM else if (strcmp(mnemonic, "NEG") == 0) return NEG_RN; else if (strcmp(mnemonic, "AND") == 0) return -8; // Special: decide between AND_RN_RM and AND_RN_IMM else if (strcmp(mnemonic, "OR") == 0) return -9; // Special: decide between OR_RN_RM and OR_RN_IMM else if (strcmp(mnemonic, "XOR") == 0) return -10; // Special: decide between XOR_RN_RM and XOR_RN_IMM else if (strcmp(mnemonic, "NOT") == 0) return NOT_RN; else if (strcmp(mnemonic, "SHL") == 0) return SHL_RN_IMM; else if (strcmp(mnemonic, "SHR") == 0) return SHR_RN_IMM; else if (strcmp(mnemonic, "SAR") == 0) return SAR_RN_IMM; else if (strcmp(mnemonic, "JMP") == 0) return -11; //Special: if + or - present choose JMP_REL, otherwise JMP else if (strcmp(mnemonic, "INC") == 0) return -12; //Special: decide between INC_RN and INC_ADDR else if (strcmp(mnemonic, "DEC") == 0) return -13; //Special: decide between DEC_RN and DEC_ADDR else if (strcmp(mnemonic, "CMP") == 0) return CMP; else if (strcmp(mnemonic, "JE") == 0) return JE; else if (strcmp(mnemonic, "JNE") == 0) return JNE; else if (strcmp(mnemonic, "JMPBS") == 0) return -14; //Special: decide between BIT_TS_RN and BIT_TS_ADDR else if (strcmp(mnemonic, "JMPBC") == 0) return -15; //Special: decide between BIT_TC_RN and BIT_TC_ADDR else if (strcmp(mnemonic, "BITS") == 0) return -16; //Special: decide between BITS_RN and BITS_ADDR else if (strcmp(mnemonic, "BITC") == 0) return -17; //Special: decide between BITC_RN and BITC_ADDR else if (strcmp(mnemonic, "JG") == 0) return JG; else if (strcmp(mnemonic, "JL") == 0) return JL; else if (strcmp(mnemonic, "JGE") == 0) return JGE; else if (strcmp(mnemonic, "JLE") == 0) return JLE; else if (strcmp(mnemonic, "CALL") == 0) return CALL; else if (strcmp(mnemonic, "RET") == 0) return RET; else { return -1; } } // // In this simple assembler, some instructions share a mnemonic, and we must choose the correct opcode // based on the type of the operand (register vs. immediate vs. memory). // The following helper functions decide that, given two operands (as strings). // // For example, "MOV Rn, 42" should choose MOV_IMM_RN, while "MOV Rn, Rm" should choose MOV_RN_RM. // We assume that memory addresses are written in square brackets, e.g. "[123]". // int resolveMOV(const char *dest, const char *src) { // If dest starts with '[' then it is a memory destination. if (dest[0] == '[') return MOV_RN_ADDR; // actually, MOV [Addr], Rn expects Rn in second operand // Otherwise, dest is a register. // Now, check src: if ((dest[0] == 'R' || dest[0] == 'r') && (src[0] == 'R' || src[0] == 'r')) { return MOV_RN_RM; } else if (src[0] == '[') { return MOV_ADDR_RN; } else { return MOV_IMM_RN; } } int resolveALU(int baseOpcode, const char *src, const char *dest) { // baseOpcode is one of our special negative values for ADD, SUB, etc. if (dest[0] == 'R' || dest[0] == 'r') { switch (baseOpcode) { case -3: return ADD_RN_RM; case -4: return SUB_RN_RM; case -5: return MUL_RN_RM; case -6: return DIV_RN_RM; case -7: return MOD_RN_RM; case -8: return AND_RN_RM; case -9: return OR_RN_RM; case -10: return XOR_RN_RM; default: return -1; } } else if (src[0] == 'r' || src[0] == 'R' && baseOpcode < -11) { switch (baseOpcode) { case -12: return INC_RN; case -13: return DEC_RN; case -14: return BIT_TS_RN; case -15: return BIT_TC_RN; case -16: return BITS_RN; case -17: return BITC_RN; default: return -1; } } else if (src[0] == '+' || src[0] == '-') { switch (baseOpcode) { case -11: return JMP_REL; default: return -1; } } else { switch (baseOpcode) { case -3: return ADD_RN_IMM; case -4: return SUB_RN_IMM; case -5: return MUL_RN_IMM; case -6: return DIV_RN_IMM; case -7: return MOD_RN_IMM; case -8: return AND_RN_IMM; case -9: return OR_RN_IMM; case -10: return XOR_RN_IMM; case -11: return JMP; case -12: return INC_ADDR; case -13: return DEC_ADDR; case -14: return BIT_TS_ADDR; case -15: return BIT_TC_ADDR; case -16: return BITS_ADDR; case -17: return BITC_ADDR; default: return -1; } } } // Reads a single line from the source string. const char *readLine(const char *source, char *buffer, size_t maxLen) { size_t i = 0; while (*source && *source != '\n' && i < maxLen - 1) { buffer[i++] = *source++; } buffer[i] = '\0'; return (*source == '\n') ? source + 1 : source; } // // The first pass scans the assembly source file to record all labels and their addresses. // The address is simply the offset into the output machine code buffer. // For this example, every instruction is assumed to have a fixed length (opcode plus operand bytes). // void firstPass(const char *source) { char line[MAX_LINE_LENGTH]; int addr = 0; labelCount = 0; const char *ptr = source; while (*ptr) { ptr = readLine(ptr, line, sizeof(line)); trim(line); // Skip blank lines or comments. if (line[0] == '\0' || line[0] == ';' || line[0] == '#') continue; // Process labels. char *colon = strchr(line, ':'); if (colon != NULL) { *colon = '\0'; trim(line); addLabel(line, addr); char *rest = colon + 1; trim(rest); if (strlen(rest) == 0) continue; strcpy(line, rest); } // Parse the mnemonic and operands. char mnemonic[32], operand1[64], operand2[64], operand3[64]; operand1[0] = '\0'; operand2[0] = '\0'; sscanf(line, "%31s %63[^ ] %63[^ ] %63s", mnemonic, operand1, operand2, operand3); // Use the mapper to get a base opcode. int baseOpcode = getOpcode(mnemonic); if (baseOpcode == -1) { printf("Unknown instruction: %s\n", mnemonic); continue; } addr += CPU_INSTRUCTION_SIZE; } } // // The second pass actually translates the assembly instructions to machine code. // The machine code is written into the provided buffer. (It must be large enough.) // uint32_t completePass(const char *source, CPU *cpu, bool erase) { if (erase) { memset(cpu->memory, 0, sizeof(cpu->memory)); memset(cpu->regs, 0, sizeof(cpu->regs)); memset(cpu->stack, 0, sizeof(cpu->stack)); memset(cpu->addrToLineMapper, 0, sizeof(cpu->addrToLineMapper)); cpu->pc = 0; cpu->stack_ptr = 0; cpu->flags = 0; cpu->cycle = 0; } // First pass: determine label addresses. firstPass(source); char line[MAX_LINE_LENGTH]; uint32_t addr = 0; const char *ptr = source; uint32_t lineIndex = 0; while (*ptr) { ptr = readLine(ptr, line, sizeof(line)); trim(line); if (line[0] == '\0' || line[0] == ';' || line[0] == '#') { lineIndex++; continue; } // Stop at the first semicolon. char *semicolon = strchr(line, ';'); if (semicolon != NULL) { *semicolon = '\0'; // Terminate the string at the semicolon trim(line); // Trim again in case spaces remain if (line[0] == '\0') { lineIndex++; continue; } } // Remove any label definitions (up to the colon). char *colon = strchr(line, ':'); if (colon != NULL) { lineIndex++; continue; } if (strlen(line) == 0) continue; // Parse mnemonic and up to three operands. char mnemonic[32], operand1[64], operand2[64], operand3[64]; mnemonic[0] = operand1[0] = operand2[0] = operand3[0] = '\0'; sscanf(line, "%31s %63[^ ] %63[^ ] %63s", mnemonic, operand1, operand2, operand3); // (Optionally, you might trim each operand individually here.) uint32_t oldAddr = addr; // Map the mnemonic to a base opcode. int baseOpcode = getOpcode(mnemonic); if (baseOpcode == -1) { fprintf(stderr, "Unknown instruction: %s\n", mnemonic); return 1; } // --- MOV Instruction (baseOpcode == -2) --- if (baseOpcode == -2) { if (strlen(operand1) == 0 || strlen(operand2) == 0) { fprintf(stderr, "Error: MOV requires two operands.\n"); return 1; } int resolvedOpcode = resolveMOV(operand2, operand1); cpu->memory[addr++] = resolvedOpcode; if (resolvedOpcode == MOV_IMM_RN) { uint8_t imm = parseImmediate(operand1); int reg = parseRegister(operand2); cpu->memory[addr++] = imm; cpu->memory[addr++] = reg; } else if (resolvedOpcode == MOV_RN_RM) { int regSrc = parseRegister(operand1); int regDest = parseRegister(operand2); cpu->memory[addr++] = regSrc; cpu->memory[addr++] = regDest; } else if (resolvedOpcode == MOV_ADDR_RN) { // Assume source is written as "[address]": remove the brackets. char addrStr[32]; strncpy(addrStr, operand1 + 1, strlen(operand1) - 2); addrStr[strlen(operand1) - 2] = '\0'; uint32_t memAddr = (uint32_t) strtoul(addrStr, NULL, 0); int reg = parseRegister(operand2); cpu->memory[addr++] = memAddr & 0xFF; cpu->memory[addr++] = (memAddr >> 8) & 0xFF; cpu->memory[addr++] = (memAddr >> 16) & 0xFF; cpu->memory[addr++] = reg; } else if (resolvedOpcode == MOV_RN_ADDR) { // Destination is memory (written as "[address]"). char addrStr[32]; strncpy(addrStr, operand2 + 1, strlen(operand2) - 2); addrStr[strlen(operand2) - 2] = '\0'; int reg = parseRegister(operand1); uint32_t memAddr = (uint32_t) strtoul(addrStr, NULL, 0); cpu->memory[addr++] = reg; cpu->memory[addr++] = memAddr & 0xFF; cpu->memory[addr++] = (memAddr >> 8) & 0xFF; cpu->memory[addr++] = (memAddr >> 16) & 0xFF; } } // --- INC and DEC (baseOpcode == -12 or -13) --- // These instructions require only a single operand. else if (baseOpcode == -12 || baseOpcode == -13) { if (strlen(operand1) == 0) { fprintf(stderr, "Error: %s requires one operand.\n", mnemonic); return 1; } int resolvedOpcode = resolveALU(baseOpcode, operand1, operand2); cpu->memory[addr++] = resolvedOpcode; if (operand1[0] == 'R' || operand1[0] == 'r') { int reg = parseRegister(operand1); cpu->memory[addr++] = reg; } else { // Assume memory reference written as "[address]". char addrStr[32]; strncpy(addrStr, operand1 + 1, strlen(operand1) - 2); addrStr[strlen(operand1) - 2] = '\0'; uint32_t memAddr = (uint32_t) strtoul(addrStr, NULL, 0); cpu->memory[addr++] = memAddr & 0xFF; cpu->memory[addr++] = (memAddr >> 8) & 0xFF; cpu->memory[addr++] = (memAddr >> 16) & 0xFF; } } // --- Other Ambiguous ALU Instructions (ADD, SUB, MUL, etc.) --- // These require two operands (destination and source). else if (baseOpcode < 0 && baseOpcode != -2 && baseOpcode != -11 && baseOpcode != -14 && baseOpcode != -15 && baseOpcode != -12 && baseOpcode != -13) { if (strlen(operand1) == 0 || strlen(operand2) == 0) { fprintf(stderr, "Error: %s requires two operands.\n", mnemonic); return 1; } int resolvedOpcode = resolveALU(baseOpcode, operand1, operand2); cpu->memory[addr++] = resolvedOpcode; int regDest = parseRegister(operand1); cpu->memory[addr++] = regDest; if (operand2[0] == 'R' || operand2[0] == 'r') { int regSrc = parseRegister(operand2); cpu->memory[addr++] = regSrc; } else { uint8_t imm = parseImmediate(operand2); cpu->memory[addr++] = imm; } } // --- JMP Instruction (baseOpcode == -11) --- else if (baseOpcode == -11) { if (strlen(operand1) == 0) { fprintf(stderr, "Error: JMP requires one operand.\n"); return 1; } int resolvedOpcode = resolveALU(baseOpcode, operand1, operand2); cpu->memory[addr++] = resolvedOpcode; if (operand1[0] == '+' || operand1[0] == '-') { // Relative jump: one-byte offset. uint8_t offset = parseImmediate(operand1); cpu->memory[addr++] = offset; } else { // Absolute jump: use label lookup for 32-bit address. uint32_t jumpAddr = (uint32_t) lookupLabel(operand1); cpu->memory[addr++] = jumpAddr & 0xFF; cpu->memory[addr++] = (jumpAddr >> 8) & 0xFF; cpu->memory[addr++] = (jumpAddr >> 16) & 0xFF; } } // --- Jump Bit Set/Clear Instructions (JMPBS, JMPBC) --- else if (baseOpcode == -14 || baseOpcode == -15) { if (strlen(operand1) == 0 || strlen(operand2) == 0 || strlen(operand3) == 0) { fprintf(stderr, "Error: %s requires three operands.\n", mnemonic); return 1; } int resolvedOpcode = resolveALU(baseOpcode, operand1, operand2); cpu->memory[addr++] = resolvedOpcode; // Encode the source operand (register or memory). if (operand1[0] == 'R' || operand1[0] == 'r') { int reg = parseRegister(operand1); cpu->memory[addr++] = reg; } else { char addrStr[32]; strncpy(addrStr, operand1 + 1, strlen(operand1) - 2); addrStr[strlen(operand1) - 2] = '\0'; uint32_t memAddr = (uint32_t) strtoul(addrStr, NULL, 0); cpu->memory[addr++] = memAddr & 0xFF; cpu->memory[addr++] = (memAddr >> 8) & 0xFF; cpu->memory[addr++] = (memAddr >> 16) & 0xFF; } // Encode the bit number (a one-byte immediate). uint8_t bitVal = parseImmediate(operand2); cpu->memory[addr++] = bitVal; // Encode the jump target (label -> 32-bit address). uint32_t jumpAddr = (uint32_t) lookupLabel(operand3); cpu->memory[addr++] = jumpAddr & 0xFF; cpu->memory[addr++] = (jumpAddr >> 8) & 0xFF; cpu->memory[addr++] = (jumpAddr >> 16) & 0xFF; } // --- Non-ambiguous Instructions --- else if (baseOpcode >= 0) { switch (baseOpcode) { case CMP: case SWAP: { if (strlen(operand1) == 0 || strlen(operand2) == 0) { fprintf(stderr, "Error: %s requires two operands.\n", mnemonic); return 1; } cpu->memory[addr++] = baseOpcode; int r1 = parseRegister(operand1); int r2 = parseRegister(operand2); cpu->memory[addr++] = r1; cpu->memory[addr++] = r2; break; } case SWAPN: case NEG_RN: case NOT_RN: { if (strlen(operand1) == 0) { fprintf(stderr, "Error: %s requires one operand.\n", mnemonic); return 1; } cpu->memory[addr++] = baseOpcode; int reg = parseRegister(operand1); cpu->memory[addr++] = reg; break; } case SHL_RN_IMM: case SHR_RN_IMM: case SAR_RN_IMM: { if (strlen(operand1) == 0 || strlen(operand2) == 0) { fprintf(stderr, "Error: %s requires two operands.\n", mnemonic); return 1; } cpu->memory[addr++] = baseOpcode; int reg = parseRegister(operand1); cpu->memory[addr++] = reg; uint8_t imm = parseImmediate(operand2); cpu->memory[addr++] = imm; break; } case JE: case JNE: case JG: case JL: case JGE: case JLE: case CALL: { if (strlen(operand1) == 0) { fprintf(stderr, "Error: %s requires one operand.\n", mnemonic); return 1; } cpu->memory[addr++] = baseOpcode; // If the operand isn’t purely numeric, treat it as a label. if (!isdigit(operand1[0])) { int labelAddr = lookupLabel(operand1); if (labelAddr < 0) { fprintf(stderr, "Error: undefined label '%s'\n", operand1); return 1; } cpu->memory[addr++] = labelAddr & 0xFF; cpu->memory[addr++] = (labelAddr >> 8) & 0xFF; cpu->memory[addr++] = (labelAddr >> 16) & 0xFF; } else { uint32_t immAddr = (uint32_t) strtoul(operand1, NULL, 0); cpu->memory[addr++] = immAddr & 0xFF; cpu->memory[addr++] = (immAddr >> 8) & 0xFF; cpu->memory[addr++] = (immAddr >> 16) & 0xFF; } break; } case RET: case BRK: case HLT: case NOP: { cpu->memory[addr++] = baseOpcode; break; } default: { fprintf(stderr, "Error: Unhandled opcode %d\n", baseOpcode); return 1; } } } else { fprintf(stderr, "Error: Unknown instruction '%s'\n", mnemonic); return 1; } const uint32_t remainingBytes = CPU_INSTRUCTION_SIZE - (addr - oldAddr); if (remainingBytes > CPU_INSTRUCTION_SIZE) { printf("HELP, INSTRUCTION SIZE SMALLER THAN INSTRUCTION\n"); } cpu->addrToLineMapper[(addr - (addr % CPU_INSTRUCTION_SIZE)) / CPU_INSTRUCTION_SIZE] = lineIndex; addr += remainingBytes; lineIndex++; } cpu->programEnd = addr; return addr; }