Right now I'm learning assembly right now and my project is to translate byte code from a fantasy architecture to real assembly written in byte code and executing with JIT.
In order to do this, I had to implement instructions from this other architecture. Some of them were simple, just like the common assembly instructions, there were two of them that needed more bytes to implement, like these two:
RX and RY - 32 bit registers memory - array of bytes ordered in little endian that contains the original byte code
mov RX, memory[RY] - reads the next 4 bytes (starting in RY) in memory, shift right those bytes and concatenates them in RX. mov memory[RX], RY - the inverse operation. Reads the value in RY, shift left the bytes to order them in little endian.
In C code, these instructions would be (considering R and mem are global):
// mov RX, mem[RY]
void movRxMemRy(unsigned char x, unsigned char y) {
if (R[y]+3 > 128) endExecution = 1;
else R[x] = mem[R[y]+3] << 24 | mem[R[y]+2] << 16 | mem[R[y]+1] << 8 | mem[R[y]];
}
// mov mem[RX], RY
void movMemRxRy(unsigned char x, unsigned char y) {
if (R[x]+3 > 128) {
endExecution = 1;
} else {
mem[R[x]] = (R[y]);
mem[R[x]+1] = (R[y]) >> 8;
mem[R[x]+2] = (R[y]) >> 16;
mem[R[x]+3] = (R[y]) >> 24;
}
return;
}
These instructions were implemented as part of the interpreter, which should be around 5-10x slower (or more) than the assembly/jit implementation, but right now it takes 1/3 of the time (around 1,7~1,8s) to run these instructions. Our implementation of the instructions has to run the following instructions that the professor gave us on the original byte code:
mov R0, 0x006C
mov R1, 0x0001
mov R2, [R0] # start of huge the loop. [R0] contains the loop counter
cmp R15, R2 # R15 = 0
je 0x0030 # ends the loop execution
mov R14, R2
add R13, R14
sub R2, R1 # decrements the loop counter by 1
mov [R0], R2 # saves the loop counter
jmp 0xFFC8 # returns to the start of the loop
Since the loop is responsible for more than 99% of the execution time, I decided to paste only this part here. The loop counter is the value contained in [R0] and it starts in 0x03885533 (decremented by on 1 each iteration). It quits the loop once the value reaches zero.
The most complex instructions are the ones responsible for most of the execution time, in addition to the add and sub instructions. I need to optimize them to be faster and, if possible, use the least amount of bytes, because I believe that there may be something wrong with them since it is running in 4,5s. The assembly/jit version is faster than the interpreted version and has to run in less than 1 second (the time limit for this project). My current implementation of them is:
r15: contains the array of 16 32-bit "registers" from the fantasy architecture used to store the final result rbx: contains the memory array that has the original byte code (and the counter value).
// Sub instruction is the same as the add instruction, but changing the opcode byte.
void add(unsigned char opcode, unsigned char x, unsigned char y) {
start = c;
// 0x09 - add rx, ry
// mov r14d, [r15+4*y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x77;
machine[c++] = 4*y;
// add [r15+4x], r14d
machine[c++] = 0x45;
machine[c++] = 0x01;
machine[c++] = 0x77;
machine[c++] = 4*x;
end = c;
for (k= 0; k < (88 - (end-start)); k++) {
machine[c++] = 0x90;
}
end = c;
}
// mov RX, mem[RY]
void movRxMemRy(unsigned char opcode, unsigned char x, unsigned char y) {
// xor r14, r14
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xf6;
// xor r13, r13
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xed;
// xor r12, r12
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xe4;
// mov r12d,DWORD PTR [r15+4*Y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x67;
machine[c++] = 0x4*y;
// mov r13b,BYTE PTR [rbx+r12*1+0x3]
machine[c++] = 0x46;
machine[c++] = 0x8a;
machine[c++] = 0x6c;
machine[c++] = 0x23;
machine[c++] = 0x03;
// shl r13,0x18
machine[c++] = 0x49;
machine[c++] = 0xc1;
machine[c++] = 0xe5;
machine[c++] = 0x18;
// or r14,r13
machine[c++] = 0x4d;
machine[c++] = 0x09;
machine[c++] = 0xee;
// xor r13, r13
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xed;
// mov r13b,BYTE PTR [rbx+r12*1+0x2]
machine[c++] = 0x46;
machine[c++] = 0x8a;
machine[c++] = 0x6c;
machine[c++] = 0x23;
machine[c++] = 0x02;
// shl r13,0x10
machine[c++] = 0x49;
machine[c++] = 0xc1;
machine[c++] = 0xe5;
machine[c++] = 0x10;
// or r14,r13
machine[c++] = 0x4d;
machine[c++] = 0x09;
machine[c++] = 0xee;
// xor r13, r13
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xed;
// mov r13b,BYTE PTR [rbx+r12*1+0x1]
machine[c++] = 0x46;
machine[c++] = 0x8a;
machine[c++] = 0x6c;
machine[c++] = 0x23;
machine[c++] = 0x01;
// shl r13,0x18
machine[c++] = 0x49;
machine[c++] = 0xc1;
machine[c++] = 0xe5;
machine[c++] = 0x08;
// or r14,r13
machine[c++] = 0x4d;
machine[c++] = 0x09;
machine[c++] = 0xee;
// xor r13, r13
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xed;
// mov r13b,BYTE PTR [rbx+r12*1]
machine[c++] = 0x46;
machine[c++] = 0x8a;
machine[c++] = 0x2c;
machine[c++] = 0x23;
// or r14,r13
machine[c++] = 0x4d;
machine[c++] = 0x09;
machine[c++] = 0xee;
// mov r13b,BYTE PTR [rbx+r12*1+0x3]
machine[c++] = 0x45;
machine[c++] = 0x89;
machine[c++] = 0x77;
machine[c++] = x*4;
end = c;
}
void movMemRxRy(unsigned char opcode, unsigned char x, unsigned char y) {
start = c;
// xor r14, r14
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xf6;
// xor r13, r13
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xed;
// xor r12, r12
machine[c++] = 0x4d;
machine[c++] = 0x31;
machine[c++] = 0xe4;
// r12d,DWORD PTR [r15+0xc] (atual)
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x67;
machine[c++] = 0x4*x;
// mov r14d,DWORD PTR [r15+4*Y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x77;
machine[c++] = 0x4*y;
// mov DWORD PTR [rbx+r12*1], r14b
machine[c++] = 0x46;
machine[c++] = 0x88;
machine[c++] = 0x34;
machine[c++] = 0x23;
// mov r14d,DWORD PTR [r15+4*Y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x77;
machine[c++] = 0x4*y;
// shl r14d, 0x8
machine[c++] = 0x41;
machine[c++] = 0xc1;
machine[c++] = 0xee;
machine[c++] = 0x08;
// mov BYTE PTR [rbx+r12*1+0x1],r14b
machine[c++] = 0x46;
machine[c++] = 0x88;
machine[c++] = 0x74;
machine[c++] = 0x23;
machine[c++] = 0x01;
// mov r14d,DWORD PTR [r15+4*Y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x77;
machine[c++] = 0x4*y;
// shl r14d, 0x10
machine[c++] = 0x41;
machine[c++] = 0xc1;
machine[c++] = 0xee;
machine[c++] = 0x10;
// mov BYTE PTR [rbx+r12*1+0x2],r14b
machine[c++] = 0x46;
machine[c++] = 0x88;
machine[c++] = 0x74;
machine[c++] = 0x23;
machine[c++] = 0x02;
// mov r14d,DWORD PTR [r15+4*Y]
machine[c++] = 0x45;
machine[c++] = 0x8b;
machine[c++] = 0x77;
machine[c++] = 0x4*y;
// shl r14d, 0x18
machine[c++] = 0x41;
machine[c++] = 0xc1;
machine[c++] = 0xee;
machine[c++] = 0x18;
// mov BYTE PTR [rbx+r12*1+0x2],r14b
machine[c++] = 0x46;
machine[c++] = 0x88;
machine[c++] = 0x74;
machine[c++] = 0x23;
machine[c++] = 0x03;
end = c;
for (k= 0; k < (88 - (end-start)); k++) {
machine[c++] = 0x90;
}
end = c;
}
Since for the first project we don't need the conditional which check if the Rn+3 > 128 , I decided to make it work first without having to implement the conditional on the assembly code.
Any ideas on how I could improve my code to run below 1s? Any help will be appreciated.