1 / 66

嵌入式處理器架構與 程式設計

嵌入式處理器架構與 程式設計. 王建民 中央研究院 資訊所 2008 年 7 月. Contents. Introduction Computer Architecture ARM Architecture Development Tools GNU Development Tools ARM Instruction Set ARM Assembly Language ARM Assembly Programming GNU ARM ToolChain Interrupts and Monitor.

lynde
Download Presentation

嵌入式處理器架構與 程式設計

An Image/Link below is provided (as is) to download presentation Download Policy: Content on the Website is provided to you AS IS for your information and personal use and may not be sold / licensed / shared on other websites without getting consent from its author. Content is provided to you AS IS for your information and personal use only. Download presentation by click this link. While downloading, if for some reason you are not able to download a presentation, the publisher may have deleted the file from their server. During download, if you can't get a presentation, the file might be deleted by the publisher.

E N D

Presentation Transcript


  1. 嵌入式處理器架構與程式設計 王建民 中央研究院 資訊所 2008年 7月

  2. Contents • Introduction • Computer Architecture • ARM Architecture • Development Tools • GNU Development Tools • ARM Instruction Set • ARM Assembly Language • ARM Assembly Programming • GNU ARM ToolChain • Interrupts and Monitor

  3. Lecture 8: ARM Assembly Programming

  4. Outline • Assembly Programming • Assembly-C Interface • Peephole Optimization

  5. Example #4: String Length #include <stdio.h> extern int mystrlen(char *s); int main() { char s[20] = "Hello, World!\n"; printf("The length of the string is %d\n", mystrlen(s)); } int mystrlen(char *s1) { char *s2; s2 = s1; while (*s2 != 0) { s2++; } return (s2-s1); }

  6. int mystrlen(char *s1) { char *s2; s2 = s1; while (*s2 != 0) { s2++; } return (s2-s1); } mystrlen: s2 = s1 start_loop: if (*s2 == 0) goto end_loop s2 = s2 + 1 goto start_loop end_loop: return (s2-s1) Example #4: Pseudo Code

  7. mystrlen: s2 = s1 start_loop: if (*s2 == 0) goto end_loop s2 = s2 + 1 goto start_loop end_loop: return (s2-s1) mystrlen: r4 = r0 start_loop: r5 = *r4 if (r5 == 0) goto end_loop r4 = r4 + 1 goto start_loop end_loop: return (r4-r0) Example #4: Storage Assignment

  8. mystrlen: r4 = r0 start_loop: r5 = *r4 if (r5 == 0) goto end_loop r4 = r4 + 1 goto start_loop end_loop: return (r4-r0) .text .align 2 .global mystrlen mystrlen: mov r4, r0 start_loop: ldrb r5, [r4] cmp r5, #0 beq end_loop add r4, r4, #1 b start_loop end_loop: sub r0, r4, r0 mov pc, lr Example #4: Final Assembly Code

  9. Example #5: Summation #include <stdio.h> extern int mysum(int n, int *array); int main() { int a[5] = {1, 3, 5, 7, 9}; printf("The summation of the array is %d\n", mysum(5,a)); } int mysum(int n, int *array) { int i, sum; sum = 0; for (i = 0; i < n; i++) { sum += array[i]; } return sum; }

  10. int mysum(int n, int *array) { int i, sum; sum = 0; for (i = 0; i < n; i++) { sum += array[i]; } return sum; } mysum: sum = 0 i = 0 start_loop: if (i >= n) goto end_loop sum = sum + array[i] i = i + 1 goto start_loop end_loop: return sum Example #5: Pseudo Code

  11. mysum: sum = 0 i = 0 start_loop: if (i >= n) goto end_loop sum = sum + array[i] i = i + 1 goto start_loop end_loop: return sum mysum: r5 = 0 r4 = 0 start_loop: if (r4 >= r0) goto end_loop r6 = r1[r4] r5 = r5 + r6 r4 = r4 + 1 goto start_loop end_loop: return r5 Example #5: Storage Assignment

  12. mysum: r5 = 0 r4 = 0 start_loop: if (r4 >= r0) goto end_loop r6 = r1[r4] r5 = r5 + r6 r4 = r4 + 1 goto start_loop end_loop: return r5 .text .align 2 .global mysum mysum: mov r5, #0 mov r4, #0 start_loop: cmp r4, r0 bge end_loop ldr r6, [r1,r4,LSL#2] add r5, r5, r6 add r4, r4, #1 b start_loop end_loop: mov r0, r5 mov pc, lr Example #5: Final Assembly Code

  13. Example #6: Bubble Sort (1) #include <stdio.h> extern void bubble(int n, int *a); int main() { int i; int a[5] = {9, 7, 5, 3, 1}; bubble(5, a); printf("The sorted array:\n"); for (i = 0; i < 5; i++) { printf("a[%d] = %d\n", i, a[i]); } }

  14. Example #6: Bubble Sort (2) void sort2(int *a, int *b) { int tmp; if (*b < *a) { tmp = *a; *a = *b; *b = tmp; } } void bubble(int n, int *a) { int i, j; for (i = 0; i < n-1; i++) { for (j = 0; j < n-1-i; j++) { sort2(&a[j], &a[j+1]); } } }

  15. void bubble(int n, int *a) { int i, j; for (i = 0; i < n-1; i++) { for (j = 0; j < n-1-i; j++) { sort2(&a[j], &a[j+1]); } } } bubble: i = 0 start_outer: if (i >= n-1) goto end_outer j = 0 start_inner: if (j >= n-1-i) goto end_inner sort2(&a[j],&a[j+1]) j = j + 1 goto start_inner end_inner: i = i + 1 goto start_outer end_outer: return Example #6: Pseudo Code

  16. bubble: i = 0 start_outer: if (i >= n-1) goto end_outer j = 0 start_inner: if (j >= n-1-i) goto end_inner sort2(&a[j],&a[j+1]) j = j + 1 goto start_inner end_inner: i = i + 1 goto start_outer end_outer: return bubble: r2 = 0 start_outer: r4 = r0 - 1 if (r2 >= r4) goto end_outer r3 = 0 start_inner: r5 = r4 - r2 if (r3 >= r5) goto end_inner sort2(r1+r3*4,r1+r3*4+4) r3 = r3 + 1 goto start_inner end_inner: r2 = r2 + 1 goto start_outer end_outer: return Example #6: Storage Assignment

  17. bubble: r2 = 0 start_outer: r4 = r0 - 1 if (r2 >= r4) goto end_outer r3 = 0 start_inner: r5 = r4 - r2 if (r3 >= r5) goto end_inner sort2(r1+r3*4,r1+r3*4+4) r3 = r3 + 1 goto start_inner end_inner: r2 = r2 + 1 goto start_outer end_outer: return bubble: mov r2, #0 start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0 start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2 add r3, r3, #1 b start_inner end_inner: add r2, r2, #1 b start_outer end_outer: mov pc, lr Example #6: Assembly Code?

  18. bubble: mov r2, #0 start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0 start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2 add r3, r3, #1 b start_inner end_inner: add r2, r2, #1 b start_outer end_outer: mov pc, lr bubble: mov r2, #0 start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0 start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner stmfd sp!,{r0-r3,lr} add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2 ldmfd sp!, {r0-r3,lr} add r3, r3, #1 b start_inner end_inner: add r2, r2, #1 b start_outer end_outer: mov pc, lr Example #6: Final Assembly Code

  19. Outline • Assembly Programming • Assembly-C Interface • Peephole Optimization

  20. Generating Assembly Code from C • In this course, we will be using the GNU ARM ToolChain. • To compile a C program to assembly code • arm-elf-gcc -S filename.c • When you compile a .c file, you get a .s file • This .s file contains the assembly language code • When assembled, this code can potentially be linked and loaded as an executable • To display information from an object file • arm-elf-objdump -S -r filename

  21. int a, b; int main() { a = 3; b = 4; } /* end main() */ .file "example4.c" .text .align 2 .global main .type main, %function main: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 ldr r2, .L3 mov r3, #3 str r3, [r2, #0] ldr r2, .L3+4 mov r3, #4 str r3, [r2, #0] ldmfd sp, {fp, sp, pc} .L4: .align 2 .L3: .word a .word b .size main, .-main .comm a,4,4 .comm b,4,4 .ident "GCC: (GNU) 4.0.0" Example #7: A Simple Program Loader will put addresses of a and b in this memory location Declare storage for a and b

  22. Example #7: Object File example1.o: file format elf32-littlearm Disassembly of section .text: 00000000 <main>: 0: e1a0c00d mov ip, sp 4: e92dd800 stmdb sp!, {fp, ip, lr, pc} 8: e24cb004 sub fp, ip, #4 ; 0x4 c: e59f2014 ldr r2, [pc, #20] ; 28 <.text+0x28> 10: e3a03003 mov r3, #3 ; 0x3 14: e5823000 str r3, [r2] 18: e59f200c ldr r2, [pc, #12] ; 2c <.text+0x2c> 1c: e3a03004 mov r3, #4 ; 0x4 20: e5823000 str r3, [r2] 24: e89da800 ldmia sp, {fp, sp, pc} ... 28: R_ARM_ABS32 a 2c: R_ARM_ABS32 b

  23. Example #7: Executable File 00008208 <main>: 8208: e1a0c00d mov ip, sp 820c: e92dd800 stmdb sp!, {fp, ip, lr, pc} 8210: e24cb004 sub fp, ip, #4 ; 0x4 8214: e59f2014 ldr r2, [pc, #20] ; 8230 <.text+0x210> 8218: e3a03003 mov r3, #3 ; 0x3 821c: e5823000 str r3, [r2] 8220: e59f200c ldr r2, [pc, #12] ; 8234 <.text+0x214> 8224: e3a03004 mov r3, #4 ; 0x4 8228: e5823000 str r3, [r2] 822c: e89da800 ldmia sp, {fp, sp, pc} 8230: 0000adc4 andeq sl, r0, r4, asr #27 8234: 0000adc0 andeq sl, r0, r0, asr #27

  24. Example #8: Calling A Function int tmp; void swap(int a, int b); int main() { int a, b; a = 3; b = 4; swap(a, b); } /* end main() */ void swap(int a, int b) { tmp = a; a = b; b = tmp; } /* end swap() */

  25. main: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #8 mov r3, #3 str r3, [fp, #-20] mov r3, #4 str r3, [fp, #-16] ldr r0, [fp, #-20] ldr r1, [fp, #-16] bl swap sub sp, fp, #12 ldmfd sp, {fp, sp, pc} swap: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #8 str r0, [fp, #-16] str r1, [fp, #-20] ldr r2, .L5 ldr r3, [fp, #-16] str r3, [r2, #0] ldr r3, [fp, #-20] str r3, [fp, #-16] ldr r3, .L5 ldr r3, [r3, #0] str r3, [fp, #-20] sub sp, fp, #12 ldmfd sp, {fp, sp, pc} .L6: .align 2 .L5: .word tmp .comm tmp,4,4 Example #8: Assembly Listing

  26. Example #9: Manipulating Pointers int tmp; int *pa, *pb; void swap(int a, int b); int main() { int a, b; pa = &a; pb = &b; *pa = 3; *pb = 4; swap(*pa, *pb); } /* end main() */ void swap(int a, int b) { tmp = a; a = b; b = tmp; } /* end swap() */

  27. main: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #8 ldr r2, .L3 sub r3, fp, #16 str r3, [r2, #0] ldr r2, .L3+4 sub r3, fp, #20 str r3, [r2, #0] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #3 str r3, [r2, #0] ldr r3, .L3+4 ldr r2, [r3, #0] mov r3, #4 str r3, [r2, #0] ldr r3, .L3 ldr r3, [r3, #0] ldr r2, [r3, #0] ldr r3, .L3+4 ldr r3, [r3, #0] ldr r3, [r3, #0] mov r0, r2 mov r1, r3 bl swap sub sp, fp, #12 ldmfd sp, {fp, sp, pc} .L4: .align 2 .L3: .word pa .word pb Example #9: Assembly Listing

  28. typedef struct testStruct { unsigned int a; unsigned int b; char c; } testStruct; testStruct *ptest; int main() { ptest->a = 4; ptest->b = 10; ptest->c = 'A'; } /* end main() */ main: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 ldr r3, .L3 ldr r2, [r3, #0] mov r3, #4 str r3, [r2, #0] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #10 str r3, [r2, #4] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #65 strb r3, [r2, #8] ldmfd sp, {fp, sp, pc} .L4: .align 2 .L3: .word ptest Example #10: Dealing with struct

  29. Example #11: Passing Arguments int tmp; void test(int a, int b, int c, int d, int *e); int main() { int a, b, c, d, e; a = 3; b = 4; c = 5; d = 6; e = 7; test(a, b, c, d, &e); } /* end main() */ void test(int a, int b, int c, int d, int *e) { tmp = a; a = b; b = tmp; c = b; b = d; *e = d; } /* end test() */

  30. main: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #24 mov r3, #3 str r3, [fp, #-28] mov r3, #4 str r3, [fp, #-24] mov r3, #5 str r3, [fp, #-20] mov r3, #6 str r3, [fp, #-16] mov r3, #7 str r3, [fp, #-32] sub r3, fp, #32 str r3, [sp, #0] ldr r0, [fp, #-28] ldr r1, [fp, #-24] ldr r2, [fp, #-20] ldr r3, [fp, #-16] bl test sub sp, fp, #12 ldmfd sp, {fp, sp, pc} Example #11: Assembly Listing (1)

  31. test: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #16 str r0, [fp, #-16] str r1, [fp, #-20] str r2, [fp, #-24] str r3, [fp, #-28] ldr r2, .L5 ldr r3, [fp, #-16] str r3, [r2, #0] ldr r3, [fp, #-20] str r3, [fp, #-16] ldr r3, .L5 ldr r3, [r3, #0] str r3, [fp, #-20] ldr r3, [fp, #-20] str r3, [fp, #-24] ldr r3, [fp, #-28] str r3, [fp, #-20] ldr r2, [fp, #4] ldr r3, [fp, #-28] str r3, [r2, #0] sub sp, fp, #12 ldmfd sp, {fp, sp, pc} .L6: .align 2 .L5: .word tmp Example #11: Assembly Listing (2)

  32. Interfacing C and Assembly • ARM has developed a standard called the “ARM Procedure Call Standard” (APCS) which defines: • constraints on the use of registers • stack conventions • format of a stack backtrace data structure • argument passing and result return • support for ARM shared library mechanism • Compiler-generated code conforms to the APCS • It's just a standard - not an architectural requirement • Cannot avoid standard when interfacing C and assembly code • Can avoid standard when just writing assembly code or when writing assembly code that isn't called by C code

  33. Register Names and Use Register # APCS Name APCS Role R0 a1 argument 1 R1 a2 argument 2 R2 a3 argument 3 R3 a4 argument 4 R4..R8 v1..v5 register variables R9 sb/v6 static base/register variable R10 sl/v7 stack limit/register variable R11 fp frame pointer R12 ip scratch reg/new-sb in inter-link-unit calls R13 sp low end of current stack frame R14 lr link address/scratch register R15 pc program counter

  34. STM sp!, {r0-r15} The ARM processor uses a bit-vector to represent each register to be saved The architecture places the lowest number register into the lowest address Default STM == STMDB == STMFD SPbefore SPafter How Does STM Work on Memory ? address 0x90 0x8c 0x88 0x84 0x80 0x7c 0x78 0x74 0x70 0x6c 0x68 0x64 0x60 0x5c 0x58 0x54 0x50 pc lr sp ip fp v7 v6 v5 v4 v3 v2 v1 a4 a3 a2 a1

  35. Passing and Returning Structures • Structures are usually passed in registers (and overflow onto the stack when necessary) • When a function returns a struct, a pointer to where the struct result is to be placed is passed in a1 (first argument) • Example struct s f(int x); -- is compiled as -- void f(struct s *result, int x);

  36. typedef struct two_ch_struct{ char ch1; char ch2; } two_ch; two_ch max(two_ch a, two_ch b){ return((a.ch1 > b.ch1)?a:b); } /* end max() */ max: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #12 str r0, [fp, #-24] str r1, [fp, #-16] str r2, [fp, #-20] ldrb r2, [fp, #-16] ldrb r3, [fp, #-20] cmp r2, r3 bls .L2 ldr r3, [fp, #-16] ldr r2, [fp, #-24] str r3, [r2, #0] b .L1 .L2: ldr r3, [fp, #-20] ldr r2, [fp, #-24] str r3, [r2, #0] .L1: ldr r0, [fp, #-24] sub sp, fp, #12 ldmfd sp, {fp, sp, pc} Example #12: Passing Structures

  37. Frame pointer (fp) points to the top of stack for function By using the frame pointer and storing it at the same offset for every function call, it creates a singly-linked list of activation records foo: mov ip,sp stmfd sp!,{a1-a3,fp,ip,lr,pc} sub fp,ip,#4 <computations go here> sub fp,fp,#12 ldmfd fp,{fp,sp,pc} ip fp sp The Frame Pointer address 0x90 0x8c 0x88 0x84 0x80 0x7c 0x78 0x74 0x70 pc lr ip fp a3 a2 a1

  38. Backtrace • The fp register points to the stack backtrace structure for the currently executing function. • The saved fp value is (zero or) a pointer to a stack backtrace structure created by the function which called the current function. • The saved fp value in this structure is a pointer to the stack backtrace structure for the function that called the function that called the current function; and so on back until the first function.

  39. SPbefore IPcurrent FPafter SPcurrent Creating the “Backtrace” Structure address 0x90 0x8c 0x88 0x84 0x80 0x7c 0x78 0x74 0x70 0x6c 0x68 0x64 0x60 0x5c 0x58 0x54 0x50 MOV ip, sp STMFD sp!,{a1-a4,v1-v7,fp,ip,sp,lr,pc} SUB fp, ip, #4 … … sub fp, fp, #16 LDMFD fp, {fp,sp,sb,pc} (saved) pc (saved) lr (saved) sp (saved) ip (saved) fp v7 v6 v5 v4 v3 v2 v1 a4 a3 a2 a1

  40. (saved) pc (saved) pc (saved) pc (saved) lr (saved) lr (saved) lr (saved)sp (saved)sp (saved)sp (saved)ip (saved)ip (saved)ip (saved) fp (saved) fp (saved) fp v7 v7 v7 v6 v6 v6 v5 v5 v5 v4 v4 v4 v3 v3 v3 v2 v2 v2 v1 v1 v1 a4 a4 a4 a3 a3 a3 a2 a2 a2 a1 a1 a1 Example Backtrace main’s frame foo’s frame bar’s frame fp

  41. Exercise #1 • Write an assembly subroutine that implements the quicksort algorithm to sort a list of unsigned integer values. • The first entry in the list is the list’s length. • void quickSort(unsigned int *list); Input Output list: 0x00000005 0x00000005 0xA356A101 0x09250037 0xE235C203 0x29567322 0x7A35B310 0x7A35B310 0x09250037 0xA356A101 0x29567322 0xE235C203

  42. Exercise #2 • Write an assembly subroutine that deletes an item from an ordered list of unsigned values if it is present in the list. • The first entry in the list is the list’s length. • void removeItem(unsigned int item, unsigned int *list); Input Output item: 0x7A35B310 list: 0x00000005 0x00000004 0x09250037 0x09250037 0x29567322 0x29567322 0x7A35B310 0xA356A101 0xA356A101 0xE235C203 0xE235C203

  43. Outline • Assembly Programming • Assembly-C Interface • Peephole Optimization

  44. Peephole Optimization • Final pass over generated code: • Examine a few consecutive instructions: 2 to 4 • See if an obvious replacement is possible: store/load pairs MOV %eax => mema MOV mema => %eax • Can eliminate the second instruction without needing any global knowledge of mema • Use algebraic identities • Special-case individual instructions

  45. Algebraic Identities • Worth recognizing single instructions with a constant operand: • A * 2 = A + A • A * 1 = A • A * 0 = 0 • A / 1 = A • More delicate with floating-point

  46. Is this ever helpful? • Why would anyone write X * 1? • Why bother to correct such obvious junk code? • In fact one might write #define MAX_TASKS 1 ... a = b * MAX_TASKS; • Also, seemingly redundant code can be produced by other optimizations. This is an important effect.

  47. Replace Multiply by Shift • A := A * 4; • Can be replaced by 2-bit left shift (signed/unsigned) • But must worry about overflow if language does • A := A / 4; • If unsigned, can replace with shift right • But shift right arithmetic is a well-known problem • Language may allow it anyway (traditional C)

  48. Addition Chains for Multiplication • If multiply is very slow (or on a machine with no multiply instruction like the original SPARC), decomposing a constant operand into sum of powers of two can be effective: • x * 125 = x * 128 - x * 4 + x • Two shifts, one subtract and one add, which may be faster than one multiply • Note similarity with efficient exponentiation method

  49. The Right Shift Problem • Arithmetic Right shift: • Shift right and use sign bit to fill most significant bits • -5 111111...1111111011 • SAR 111111...1111111101 • Which is -3, not -2 • In most languages -5/2 = -2 • Prior to C99, implementations were allowed to truncate towards or away from zero if either operand was negative

  50. Folding Jumps to Jumps • A jump to an unconditional jump can copy the target address JNE lab1 ... lab1: JMP lab2 • Can be replaced by JNE lab2 • As a result, lab1 may become dead (unreferenced)

More Related