Monday, January 19, 2009

Fun with assembler (on FreeBSD)

Searching is the heart of artificial intelligence, but what do you do when the core of your algorithms is much larger than it should be? Optimize!

Take this code for example

#include <assert.h>
#include <string.h>

#define NUM_NODES 10
#define MAX_STACK 10

int adjMatrix[NUM_NODES][NUM_NODES];
int visited[NUM_NODES];

int stack[MAX_STACK];
int stack_index;

void initAdjMatrix(void)
{
memset((void *)adjMatrix, 0, sizeof(adjMatrix));
memset((void *)visited, 0, sizeof(visited));
}

void makeEdge(int from, int to)
{
assert(from < NUM_NODES);
assert(to < NUM_NODES);

adjMatrix[from][to] = 1;
}

void initStack(void)
{
stack_index = 0;
}

void push(int elem)
{
assert(stack_index < MAX_STACK);
stack[stack_index++] = elem;
}

int pop(void)
{
int elem;
assert(stack_index > 0);
elem = stack[--stack_index];
return elem;
}

int empty(void)
{
if (stack_index == 0)
return 1;
return 0;
}

void dfs(int start, int goal)
{
int node, i;

push(start);
while (!empty())
{
node = pop();

if (node == goal)
{
printf("%d Goal\n", node);
return;
}

if (visited[node])
continue;
else
visited[node] = 1;

printf("%d ", node);

for (i = NUM_NODES - 1; i > -1; i--)
if (adjMatrix[node][i] == 1)
push(i);
}

printf("Goal not found!\n");
}

int main(int argc, char *argv[])
{
initAdjMatrix();
initStack();

//make our graph
makeEdge(1, 2);
makeEdge(1, 3);
makeEdge(1, 4);
makeEdge(2, 5);
makeEdge(3, 5);
makeEdge(3, 6);
makeEdge(4, 7);
makeEdge(5, 8);
makeEdge(5, 9);

dfs(1, 6);

return 0;
}

(Slightly modified version of the depth first search from "AI Application Programming" Second Edition by M. Tim Jones)

Lets run that through gcc:

[craig ~/Desktop/ai/dfs]$ gcc -o dfs dfs.c
[craig ~/Desktop/ai/dfs]$ ls -l dfs
-rwxr-xr-x 1 craig wheel 6029 Aug 18 22:53 dfs
6029 bytes!!!

Ok, lets tell gcc to optimize it:

[craig ~/Desktop/ai/dfs]$ gcc -O3 dfs.c -o dfs
[craig ~/Desktop/ai/dfs]$ ls -l dfs
-rwxr-xr-x 1 craig wheel 6274 Aug 18 22:54 dfs
Horray for sacraficing space for speed... I want both though!

Lets take a look at the code sections just out of curiosity

Sections:
Idx Name Size VMA LMA File off Algn
0 .interp 00000015 080480d4 080480d4 000000d4 2**0
CONTENTS, ALLOC, LOAD, READONLY, DATA
1 .hash 00000098 080480ec 080480ec 000000ec 2**2
CONTENTS, ALLOC, LOAD, READONLY, DATA
2 .dynsym 00000130 08048184 08048184 00000184 2**2
CONTENTS, ALLOC, LOAD, READONLY, DATA
3 .dynstr 000000cb 080482b4 080482b4 000002b4 2**0
CONTENTS, ALLOC, LOAD, READONLY, DATA
4 .rel.plt 00000030 08048380 08048380 00000380 2**2
CONTENTS, ALLOC, LOAD, READONLY, DATA
5 .init 00000011 080483b0 080483b0 000003b0 2**2
CONTENTS, ALLOC, LOAD, READONLY, CODE
6 .plt 00000070 080483c4 080483c4 000003c4 2**2
CONTENTS, ALLOC, LOAD, READONLY, CODE
7 .text 00000428 08048434 08048434 00000434 2**2
CONTENTS, ALLOC, LOAD, READONLY, CODE
8 .fini 0000000c 0804885c 0804885c 0000085c 2**2
CONTENTS, ALLOC, LOAD, READONLY, CODE
9 .rodata 00000113 08048868 08048868 00000868 2**0
CONTENTS, ALLOC, LOAD, READONLY, DATA
10 .data 0000000c 0804997c 0804997c 0000097c 2**2
CONTENTS, ALLOC, LOAD, DATA
11 .eh_frame 00000004 08049988 08049988 00000988 2**2
CONTENTS, ALLOC, LOAD, READONLY, DATA
12 .dynamic 00000098 0804998c 0804998c 0000098c 2**2
CONTENTS, ALLOC, LOAD, DATA
13 .ctors 00000008 08049a24 08049a24 00000a24 2**2
CONTENTS, ALLOC, LOAD, DATA
14 .dtors 00000008 08049a2c 08049a2c 00000a2c 2**2
CONTENTS, ALLOC, LOAD, DATA
15 .jcr 00000004 08049a34 08049a34 00000a34 2**2
CONTENTS, ALLOC, LOAD, DATA
16 .got 00000024 08049a38 08049a38 00000a38 2**2
CONTENTS, ALLOC, LOAD, DATA
17 .bss 0000022c 08049a60 08049a60 00000a60 2**5
ALLOC
18 .comment 0000012d 00000000 00000000 00000a60 2**0
CONTENTS, READONLY

hmm...that seems unnecessary

Time for a rewrite!

bits 32

; Define our constants
%define stdout 1
%define SYS_exit 1
%define SYS_write 4
%define NUM_NODES_SIZE_1 10
%define NUM_NODES 100
%define MAX_STACK 10

; Define our messages
segment .data
goalMsg db 'Goal', 0x0A
goalMsgBytes equ $ - goalMsg

spaceMsg db ' '
spaceMsgBytes equ $ - spaceMsg

goalNotFoundMsg db 'Goal not found', 0x0A
goalNotFoundMsgBytes equ $ - goalNotFoundMsg

numbers db '0123456789'
output db 0, ' '

; Declare our arrays
segment .bss
adjMatrix resb NUM_NODES
visited resb MAX_STACK

segment .text

align 4
; ==============
; = kernelCall =
; ==============
kernelCall:
int 0x80
ret

; Define our macros
%macro system 1
mov eax, %1
call kernelCall
%endmacro

%macro sys.exit 0
system SYS_exit
%endmacro

%macro sys.write 0
system SYS_write
%endmacro

; ===========================================
; = initAdjMatrix =
; = Parameters: none =
; = Pre: none =
; = Post: adjMatrix and visited are cleared =
; ===========================================
align 4
initAdjMatrix:
push eax

; Clear adjMatrix
; Clear all of eax because we reference it later
xor eax, eax
.initAdjMatrixL1:
mov [adjMatrix + eax], byte 0
inc al
cmp al, NUM_NODES
jne .initAdjMatrixL1

; Clear visited
xor al, al
.initAdjMatrixL2
mov [visited + eax], byte 0
inc al
cmp al, MAX_STACK
jne .initAdjMatrixL2

pop eax
ret

; ==============================================
; = makeEdge =
; = Parameters: from node (ebx), to node (ecx) =
; = Pre: none =
; = Post: matrix[from][to] is set =
; ==============================================
align 4
makeEdge:
mov ebx, [esp + 8]
mov ecx, [esp + 4]
; Determine row (from * 10)
; Plus column (to)
imul ebx, ebx, NUM_NODES_SIZE_1
add ebx, ecx
mov [adjMatrix + ebx], byte 1

ret

; =============================================
; = dfs =
; = Parameters: start (eax), goal (edx) =
; = Pre: Stack and matrix must be initialized =
; = Post: Graph is traversed =
; =============================================
align 4
dfs:
push ebp
mov ebp, esp

; Set start
mov eax, [esp + 12]
; Set goal
mov edx, [esp + 8]

; Make room for stack_index
sub esp, 2

; Using ebp - 2 as stack_index
mov [ebp - 2], word 0

; Push start and increase stack index
push eax
inc word [ebp - 2]
.dfsL1:
; Loop until stack_index is 0
cmp [ebp - 2], word 0
jne .dfsL12
jmp .dfsOutNotFound
.dfsL12:
; Retrieve node
pop eax
dec word [ebp - 2]

; Check if node is our goal
cmp eax, edx
je .dfsOutFound

; If we've visited node, continue
cmp [visited + eax], byte 0
jne .dfsL1
; Set visited to 1
mov [visited + eax], byte 1

; Output eax
push eax
mov al, [numbers + eax]
mov [output], al
push dword 2
push dword output
push dword stdout
sys.write
add esp, byte 12
pop eax

; Loop from 9 to 0
xor ecx, ecx
mov cl, NUM_NODES_SIZE_1
dec cl
.dfsL2:
cmp cl, -1
je .dfsL1

; Calculate place in array
push eax
imul eax, eax, NUM_NODES_SIZE_1
add eax, ecx
; Compare our adjMatrix to 1
cmp [adjMatrix + eax], byte 1
; If not 1, loop
jne .dfsL3
pop eax
; Otherwise push our index
push ecx
inc word [ebp - 2]
dec cl
jmp .dfsL2
.dfsL3:
pop eax
dec cl
jmp .dfsL2

.dfsOutFound:
; Output eax
push eax
mov al, [numbers + eax]
mov [output], al
push dword 2
push dword output
push dword stdout
sys.write
add esp, byte 12
pop eax

; Output goal message
push dword goalMsgBytes
push dword goalMsg
push dword stdout
sys.write
add esp, byte 12

jmp .dfsLeave
.dfsOutNotFound:
; Output goal not found message
push dword goalNotFoundMsgBytes
push dword goalNotFoundMsg
push dword stdout
sys.write
add esp, byte 12
.dfsLeave:
mov esp, ebp
pop ebp
ret

; ========
; = main =
; ========
global _start
_start:
call initAdjMatrix

; Make our edges
push 1
push 2
call makeEdge
push 1
push 3
call makeEdge
push 1
push 4
call makeEdge
push 2
push 5
call makeEdge
push 3
push 5
call makeEdge
push 3
push 6
call makeEdge
push 4
push 7
call makeEdge
push 5
push 8
call makeEdge
push 5
push 9
call makeEdge

; Traverse
push 1
push 6
call dfs

push dword 0
sys.exit


I've already made a few optimizations (for example changing things like "mov eax, 0" to "xor eax, eax", or had I needed it "mov eax, 1" would become "xor eax, eax" "inc eax". Yes its 2 statements but it is actually 2 bytes smaller - longer code does not always mean longer machine code)

And lets build it:

[craig ~/Desktop/ai/dfs/asm]$ nasm -f elf -l dfs.lst dfs.asm
[craig ~/Desktop/ai/dfs/asm]$ ld -s -o dfs dfs.o
[craig ~/Desktop/ai/dfs/asm]$ ls -l dfs
-rwxr-xr-x 1 craig wheel 964 Aug 18 22:57 dfs


Ok ok we are MUCH better, but we can do a little bit better. Let's take a look at the sections:

[craig ~/Desktop/ai/dfs/asm]$ objdump -x dfs
...SNIP...
Sections:
Idx Name Size VMA LMA File off Algn
0 .text 000001eb 08048080 08048080 00000080 2**4
CONTENTS, ALLOC, LOAD, READONLY, CODE
1 .data 00000021 0804926c 0804926c 0000026c 2**2
CONTENTS, ALLOC, LOAD, DATA
2 .bss 00000070 08049290 08049290 00000290 2**2
ALLOC
3 .comment 0000001f 00000000 00000000 00000290 2**0
CONTENTS, READONLY
...SNIP...

Hmm, my guess is we really don't need the .comment section.


[craig ~/Desktop/ai/dfs/asm]$ hexdump -C -s 0x290 -n 31 dfs
00000290 00 54 68 65 20 4e 65 74 77 69 64 65 20 41 73 73 |.The Netwide Ass|
000002a0 65 6d 62 6c 65 72 20 30 2e 39 38 2e 33 39 00 |embler 0.98.39.|
000002af

Looks like nasm is making us pay for room to setup the .comment section, and is making us pay for a little recognition -- lets fix that.


[craig ~/Desktop/ai/dfs/asm]$ strip -R .comment dfs
[craig ~/Desktop/ai/dfs/asm]$ ls -l dfs
-rwxr-xr-x 1 craig wheel 884 Aug 18 23:04 dfs

884 bytes! We could do better by creating a binary file and defining the elf headers ourselves, but we've already surpassed the goal of the optimizations - 6.82 times smaller!

And if anyones curious:

[craig ~/Desktop/ai/dfs]$ ls -l dfs.c
-rw-r--r-- 1 craig wheel 1373 Aug 18 21:11 dfs.c
[craig ~/Desktop/ai/dfs]$ ls -l asm/dfs.asm
-rw-r--r-- 1 craig wheel 4310 Aug 18 21:55 asm/dfs.asm
[craig ~/Desktop/ai/dfs]$ wc -l dfs.c
103 dfs.c
[craig ~/Desktop/ai/dfs]$ wc -l asm/dfs.asm
260 asm/dfs.asm

1 comment: