This site is no longer active and is available for archival purposes only. Registration and login is disabled.

Some help to optimize my new ATARI ST emu for PocketPC...


Postby schtruck » Oct 20, 2002 @ 12:24am

schtruck
pm Member
 
Posts: 77
Joined: May 24, 2002 @ 5:29pm


Postby refractor » Oct 20, 2002 @ 1:23am

User avatar
refractor
pm Insider
 
Posts: 2304
Joined: Feb 5, 2002 @ 1:12pm
Location: Luxembourg


Postby schtruck » Oct 20, 2002 @ 10:13am

OK, as requested, here is the generated code:

; 344 : r0 = *line_i++;

00638 e28b2004 add r2, r11, #4
0063c e59b1000 ldr r1, [r11]

; 345 : r1 = *line_i++;

00640 e592e000 ldr lr, [r2]
00644 e282b004 add r11, r2, #4

; 346 : r2=(r0>>1)&mask1;

00648 e00320a1 and r2, r3, r1, lsr #1

; 347 : r3=(r1>>1)&mask1;

0064c e00360ae and r6, r3, lr, lsr #1

; 348 : r0&=mask1;

00650 e0031001 and r1, r3, r1

; 349 : r1&=mask1;

00654 e003e00e and lr, r3, lr

; 350 :
; 351 : r2+=(r2>>15);

00658 e08227a2 add r2, r2, r2, lsr #15

; 352 : r3+=(r3>>15);

0065c e08667a6 add r6, r6, r6, lsr #15

; 353 : r0+=(r0>>15);

00660 e08197a1 add r9, r1, r1, lsr #15

; 354 : r1+=(r1>>15);

00664 e08ea7ae add r10, lr, lr, lsr #15

; 355 :
; 356 : r4=(r2>>2)&mask2;
; 357 : r5=(r3>>2)&mask2;
; 358 : r2&=mask2;
; 359 : r3&=mask2;
; 360 : r4+=(r5<<2);

00668 e000e122 and lr, r0, r2, lsr #2
0066c e0001126 and r1, r0, r6, lsr #2
00670 e08e1101 add r1, lr, r1, lsl #2
00674 e0002002 and r2, r0, r2
00678 e000e006 and lr, r0, r6

; 361 : r2+=(r3<<2);

0067c e082210e add r2, r2, lr, lsl #2

; 362 :
; 363 : r3=(r0>>2)&mask2;
; 364 : r5=(r1>>2)&mask2;
; 365 : r0&=mask2;
; 366 : r1&=mask2;
; 367 : r3+=(r5<<2);

00680 e000e12a and lr, r0, r10, lsr #2
00684 e000a00a and r10, r0, r10
00688 e0006129 and r6, r0, r9, lsr #2
0068c e0009009 and r9, r0, r9
00690 e086e10e add lr, r6, lr, lsl #2

; 368 : r0+=(r1<<2);

00694 e089610a add r6, r9, r10, lsl #2

; 369 :
; 370 : *--line_o1 = hicolor[(r4&240)+((r3>>4)&15)];

00698 e1a0a22e mov r10, lr, lsr #4
0069c e20aa00f and r10, r10, #0xF ; 0xF = 15
006a0 e20190f0 and r9, r1, #0xF0 ; 0xF0 = 240
006a4 e08aa009 add r10, r10, r9
006a8 e794a10a ldr r10, [r4, +r10, lsl #2]
006ac e2477004 sub r7, r7, #4

; 371 : *--line_o1 = hicolor[(r2&240)+((r0>>4)&15)];

006b0 e20290f0 and r9, r2, #0xF0 ; 0xF0 = 240
006b4 e587a000 str r10, [r7]
006b8 e1a0a226 mov r10, r6, lsr #4
006bc e20aa00f and r10, r10, #0xF ; 0xF = 15
006c0 e08aa009 add r10, r10, r9
006c4 e794a10a ldr r10, [r4, +r10, lsl #2]
006c8 e2477004 sub r7, r7, #4

; 372 : *--line_o1 = hicolor[((r4<<4)&240)+(r3&15)];

006cc e20e900f and r9, lr, #0xF ; 0xF = 15
006d0 e587a000 str r10, [r7]
006d4 e201a00f and r10, r1, #0xF ; 0xF = 15
006d8 e089a20a add r10, r9, r10, lsl #4
006dc e794a10a ldr r10, [r4, +r10, lsl #2]
006e0 e2477004 sub r7, r7, #4

; 373 : *--line_o1 = hicolor[((r2<<4)&240)+(r0&15)];

006e4 e206900f and r9, r6, #0xF ; 0xF = 15
006e8 e587a000 str r10, [r7]
006ec e202a00f and r10, r2, #0xF ; 0xF = 15
006f0 e089a20a add r10, r9, r10, lsl #4
006f4 e794a10a ldr r10, [r4, +r10, lsl #2]
006f8 e2477004 sub r7, r7, #4

; 374 : *--line_o1 = hicolor[((r4>>8)&240)+((r3>>12)&15)];

006fc e1a0962e mov r9, lr, lsr #12
00700 e587a000 str r10, [r7]
00704 e1a0a421 mov r10, r1, lsr #8
00708 e20aa0f0 and r10, r10, #0xF0 ; 0xF0 = 240
0070c e209900f and r9, r9, #0xF ; 0xF = 15
00710 e08aa009 add r10, r10, r9
00714 e794a10a ldr r10, [r4, +r10, lsl #2]
00718 e2477004 sub r7, r7, #4

; 375 : *--line_o1 = hicolor[((r2>>8)&240)+((r0>>12)&15)];

0071c e1a09626 mov r9, r6, lsr #12
00720 e587a000 str r10, [r7]
00724 e1a0a422 mov r10, r2, lsr #8
00728 e20aa0f0 and r10, r10, #0xF0 ; 0xF0 = 240
0072c e209900f and r9, r9, #0xF ; 0xF = 15

; 376 : *--line_o1 = hicolor[((r4>>4)&240)+((r3>>8)&15)];

00730 e1a01221 mov r1, r1, lsr #4
00734 e08aa009 add r10, r10, r9
00738 e1a0e42e mov lr, lr, lsr #8
0073c e794a10a ldr r10, [r4, +r10, lsl #2]
00740 e20110f0 and r1, r1, #0xF0 ; 0xF0 = 240
00744 e20ee00f and lr, lr, #0xF ; 0xF = 15
00748 e2477004 sub r7, r7, #4
0074c e081100e add r1, r1, lr
00750 e587a000 str r10, [r7]
00754 e7941101 ldr r1, [r4, +r1, lsl #2]
00758 e2477004 sub r7, r7, #4

; 377 : *--line_o1 = hicolor[((r2>>4)&240)+((r0>>8)&15)];

0075c e1a02222 mov r2, r2, lsr #4
00760 e5871000 str r1, [r7]
00764 e1a01426 mov r1, r6, lsr #8
00768 e20220f0 and r2, r2, #0xF0 ; 0xF0 = 240
0076c e201100f and r1, r1, #0xF ; 0xF = 15
00770 e0822001 add r2, r2, r1
00774 e7942102 ldr r2, [r4, +r2, lsl #2]
00778 e2477004 sub r7, r7, #4
0077c e2488001 sub r8, r8, #1
00780 e5872000 str r2, [r7]
00784 e3580000 cmp r8, #0
00788 8affffaa bhi |$L32010| ; 00000638
schtruck
pm Member
 
Posts: 77
Joined: May 24, 2002 @ 5:29pm


Postby refractor » Oct 20, 2002 @ 10:24am

Can you generate it as "assembly with source code"?

Cheers,

Ref.
User avatar
refractor
pm Insider
 
Posts: 2304
Joined: Feb 5, 2002 @ 1:12pm
Location: Luxembourg


Postby schtruck » Oct 20, 2002 @ 11:03am

; 344 : r0 = *line_i++;

add r2, r11, #4
ldr r1, [r11]

; 345 : r1 = *line_i++;

ldr lr, [r2]
add r11, r2, #4

; 346 : r2=(r0>>1)&mask1;

and r2, r3, r1, lsr #1

; 347 : r3=(r1>>1)&mask1;

and r6, r3, lr, lsr #1

; 348 : r0&=mask1;

and r1, r3, r1

; 349 : r1&=mask1;

and lr, r3, lr

; 350 :
; 351 : r2+=(r2>>15);

add r2, r2, r2, lsr #15

; 352 : r3+=(r3>>15);

add r6, r6, r6, lsr #15

; 353 : r0+=(r0>>15);

add r9, r1, r1, lsr #15

; 354 : r1+=(r1>>15);

add r10, lr, lr, lsr #15

; 355 :
; 356 : r4=(r2>>2)&mask2;
; 357 : r5=(r3>>2)&mask2;
; 358 : r2&=mask2;
; 359 : r3&=mask2;
; 360 : r4+=(r5<<2);

and lr, r0, r2, lsr #2
and r1, r0, r6, lsr #2
add r1, lr, r1, lsl #2
and r2, r0, r2
and lr, r0, r6

; 361 : r2+=(r3<<2);

add r2, r2, lr, lsl #2

; 362 :
; 363 : r3=(r0>>2)&mask2;
; 364 : r5=(r1>>2)&mask2;
; 365 : r0&=mask2;
; 366 : r1&=mask2;
; 367 : r3+=(r5<<2);

and lr, r0, r10, lsr #2
and r10, r0, r10
and r6, r0, r9, lsr #2
and r9, r0, r9
add lr, r6, lr, lsl #2

; 368 : r0+=(r1<<2);

add r6, r9, r10, lsl #2

; 369 :
; 370 : *--line_o1 = hicolor[(r4&240)+((r3>>4)&15)];

mov r10, lr, lsr #4
and r10, r10, #0xF ; 0xF = 15
and r9, r1, #0xF0 ; 0xF0 = 240
add r10, r10, r9
ldr r10, [r4, +r10, lsl #2]
sub r7, r7, #4

; 371 : *--line_o1 = hicolor[(r2&240)+((r0>>4)&15)];

and r9, r2, #0xF0 ; 0xF0 = 240
str r10, [r7]
mov r10, r6, lsr #4
and r10, r10, #0xF ; 0xF = 15
add r10, r10, r9
ldr r10, [r4, +r10, lsl #2]
sub r7, r7, #4

; 372 : *--line_o1 = hicolor[((r4<<4)&240)+(r3&15)];

and r9, lr, #0xF ; 0xF = 15
str r10, [r7]
and r10, r1, #0xF ; 0xF = 15
add r10, r9, r10, lsl #4
ldr r10, [r4, +r10, lsl #2]
sub r7, r7, #4

; 373 : *--line_o1 = hicolor[((r2<<4)&240)+(r0&15)];

and r9, r6, #0xF ; 0xF = 15
str r10, [r7]
and r10, r2, #0xF ; 0xF = 15
add r10, r9, r10, lsl #4
ldr r10, [r4, +r10, lsl #2]
sub r7, r7, #4

; 374 : *--line_o1 = hicolor[((r4>>8)&240)+((r3>>12)&15)];

mov r9, lr, lsr #12
str r10, [r7]
mov r10, r1, lsr #8
and r10, r10, #0xF0 ; 0xF0 = 240
and r9, r9, #0xF ; 0xF = 15
add r10, r10, r9
ldr r10, [r4, +r10, lsl #2]
sub r7, r7, #4

; 375 : *--line_o1 = hicolor[((r2>>8)&240)+((r0>>12)&15)];

mov r9, r6, lsr #12
str r10, [r7]
mov r10, r2, lsr #8
and r10, r10, #0xF0 ; 0xF0 = 240
and r9, r9, #0xF ; 0xF = 15

; 376 : *--line_o1 = hicolor[((r4>>4)&240)+((r3>>8)&15)];

mov r1, r1, lsr #4
add r10, r10, r9
mov lr, lr, lsr #8
ldr r10, [r4, +r10, lsl #2]
and r1, r1, #0xF0 ; 0xF0 = 240
and lr, lr, #0xF ; 0xF = 15
sub r7, r7, #4
add r1, r1, lr
str r10, [r7]
ldr r1, [r4, +r1, lsl #2]
sub r7, r7, #4

; 377 : *--line_o1 = hicolor[((r2>>4)&240)+((r0>>8)&15)];

mov r2, r2, lsr #4
str r1, [r7]
mov r1, r6, lsr #8
and r2, r2, #0xF0 ; 0xF0 = 240
and r1, r1, #0xF ; 0xF = 15
add r2, r2, r1
ldr r2, [r4, +r2, lsl #2]
sub r7, r7, #4
sub r8, r8, #1
str r2, [r7]
cmp r8, #0
bhi |$L32010| ; 00000638
ldr r1, [sp, #0x20] ; 0x20 = 32
ldr r2, [sp, #0x14] ; 0x14 = 20
sub r1, r1, #0xA, 26 ; 0x280 = 640
str r1, [sp, #0x20] ; 0x20 = 32
add r2, r2, #2
ldr r1, [sp, #0x18] ; 0x18 = 24
str r2, [sp, #0x14] ; 0x14 = 20
ldr lr, [r1, #4]
ldr r1, [r1]
add r1, lr, r1
cmp r2, r1
blt |$L32006| ; 00000628
schtruck
pm Member
 
Posts: 77
Joined: May 24, 2002 @ 5:29pm


Postby schtruck » Oct 20, 2002 @ 11:42am

Here is a part which needs to be optimized: the reading and writing of big-endian ST memory.

first there is this:

/*
 * Big-endian (Atari ST / 68000) memory accessors for a little-endian host.
 *
 *   ReadW(addr)            -> 16-bit value stored big-endian at addr
 *   ReadL(address)         -> 32-bit value stored big-endian at address
 *   WriteW(addr, value)    -> store the low 16 bits of value big-endian at addr
 *   WriteL(address, value) -> store the 32-bit value big-endian at address
 *
 * addr/address must be a byte pointer (or byte address) that is 16-bit
 * aligned, because the access is done through a uint16 pointer.
 * Every macro evaluates its arguments more than once: do not pass
 * expressions with side effects (e.g. p++).
 */

/* Result is truncated to 16 bits so no stray bits survive above bit 15. */
#define ReadW(addr) ((uint16)((*(uint16 *)(addr) << 8) | (*(uint16 *)(addr) >> 8)))

/* Fully parenthesised, and combined as unsigned so the shift by 16 never
   moves a bit into the sign bit of a signed int. */
#define ReadL(address) (((unsigned int)ReadW(address) << 16) | (unsigned int)ReadW((address) + 2))

/* Unsigned store: avoids the sign-extension an int16 store forces on the compiler. */
#define WriteW(addr,value) (*((uint16 *)(addr)) = (uint16)((((uint16)(value)) << 8) | (((uint16)(value)) >> 8)))
/* Single comma expression, so it stays one statement under an unbraced if/else. */
#define WriteL(address,value) (WriteW((address) + 2, value), WriteW(address, (value) >> 16))


And then here is the code generated where those defines are used:

; 213 : {

|$M1495|

; 214 : return ReadW(address + membase);

ldr r1, [pc, #0x1C] ; pc+8+28 = 00000024
ldr r1, [r1]
ldrh r0, [r1, +r0]
mov r3, r0, lsl #8
mov r3, r3, lsl #16
mov r3, r3, lsr #16
mov r2, r0, lsl #16
orr r0, r3, r2, lsr #24

; 215 : }



; 218 : {

|$M1500|

; 219 : return ReadL(address + membase);

ldr r1, [pc, #0x30] ; pc+8+48 = 00000038
ldr r1, [r1]
ldrh r0, [r1, +r0]!
ldrh r1, [r1, #2]
mov r3, r0, lsl #8
mov r3, r3, lsl #16
mov r2, r1, lsl #8
mov r2, r2, lsl #16
mov r3, r3, lsr #16
mov r2, r2, lsr #16
orr r3, r3, r0, lsr #8
orr r2, r2, r1, lsr #8
orr r0, r2, r3, lsl #16

; 220 : }




; 173 : WriteW(address + membase, value);

mov r3, r4, lsl #16
ldr r0, [pc, #0x20] ; pc+8+32 = 00000058
mov r3, r3, asr #16
mov r3, r3, lsl #8
ldr r0, [r0]
mov r3, r3, lsl #16
mov r3, r3, asr #16
mov r2, r4, lsl #16
orr r3, r3, r2, lsr #24
strh r3, [r0, +r5]

; 174 : }



; 183 : WriteL(address + membase, value);

mov r3, r4, lsl #16
ldr r0, [pc, #0x44] ; pc+8+68 = 0000007C
mov r3, r3, lsr #16
mov r2, r4, lsl #8
ldr r1, [r0]
mov r3, r3, lsr #8
mov r2, r2, lsl #16
orr r3, r3, r2, asr #16
add r2, r1, r5
mov r1, r4, lsr #16
strh r3, [r2, #2]
mov r3, r1, lsl #16
ldr r0, [r0]
mov r3, r3, lsr #16
mov r2, r1, lsl #8
mov r3, r3, lsr #8
mov r2, r2, lsl #16
orr r3, r3, r2, asr #16
strh r3, [r0, +r5]

There is surely a way to optimize this big-endian ----> little-endian conversion, no?
schtruck
pm Member
 
Posts: 77
Joined: May 24, 2002 @ 5:29pm


Postby refractor » Oct 20, 2002 @ 12:17pm

I had a quick look at the generated code for the screen thing, and it isn't all that bad. You could combine some instructions (especially with the loads and stores), but you probably won't gain huge amounts of speed. It might be better to just leave it as portable C in this case, unless the speed is still not fast enough for 100% speed emulation.

Cheers,

Ref.
User avatar
refractor
pm Insider
 
Posts: 2304
Joined: Feb 5, 2002 @ 1:12pm
Location: Luxembourg


Postby schtruck » Oct 20, 2002 @ 12:24pm

Speed is fast enough for drawing now. What we could optimize next is the little-endian conversion, because I think it takes too much time.

Btw, most games are playable now; speed is between 60 and 100 percent of the original speed.

It never goes above 100% because the speed is capped....

I think that if the little-endian conversion can be optimized, emulation will be boosted a lot....
schtruck
pm Member
 
Posts: 77
Joined: May 24, 2002 @ 5:29pm


Postby refractor » Oct 20, 2002 @ 1:24pm

What code does this produce?

/* Big-endian 16-bit read built from two byte loads (high byte first).
   addr is evaluated twice: do not pass an expression with side effects. */
#define ReadW(addr) ((*(uint8 *)(addr) << 8) | *(uint8 *)((addr) + 1))

(Rather than loading a 16-bit word and splitting, loading the individual bytes should be faster).
User avatar
refractor
pm Insider
 
Posts: 2304
Joined: Feb 5, 2002 @ 1:12pm
Location: Luxembourg


Postby schtruck » Oct 20, 2002 @ 2:50pm

With the byte-load macro you suggested:

; 213 : return ReadW(address + membase);

ldr r1, [pc, #0x1C] ; pc+8+28 = 00000024
ldr r1, [r1]
add r2, r1, r0
ldrb r3, [r2, #1]
ldrb r2, [r1, +r0]
orr r3, r3, r2, lsl #8
mov r3, r3, lsl #16
mov r0, r3, lsr #16
With my original 16-bit-load macro, for comparison:


; 214 : return ReadW(address + membase);

ldr r1, [pc, #0x1C] ; pc+8+28 = 00000024
ldr r1, [r1]
ldrh r0, [r1, +r0]
mov r3, r0, lsl #8
mov r3, r3, lsl #16
mov r3, r3, lsr #16
mov r2, r0, lsl #16
orr r0, r3, r2, lsr #24
schtruck
pm Member
 
Posts: 77
Joined: May 24, 2002 @ 5:29pm


Postby schtruck » Oct 20, 2002 @ 3:00pm

What I tried before was to use the Rotr intrinsic, but that didn't work.

In fact, a simple 16-bit rotate right by 8 ("ror.w #8") would work, no?
But I don't know how to implement it.
schtruck
pm Member
 
Posts: 77
Joined: May 24, 2002 @ 5:29pm


Postby refractor » Oct 20, 2002 @ 4:05pm

User avatar
refractor
pm Insider
 
Posts: 2304
Joined: Feb 5, 2002 @ 1:12pm
Location: Luxembourg


Postby Dave H » Oct 20, 2002 @ 4:19pm

Dave H.
Lead Programmer (Repton PPC/7650)
Buy Repton Online here: http://www.handango.com/PlatformProductDetail.jsp?productId=43741
User avatar
Dave H
pm Member
 
Posts: 164
Joined: Oct 3, 2002 @ 5:01pm


Postby AndrewGower » Oct 20, 2002 @ 6:58pm

ok I've taken a look at the endian conversion thing


before you had:
----
ReadW(addr) ((*(uint16 *)(addr) << 8) | (*(uint16 *)(addr) >> 8))

which, due to the compiler being unbelievably stupid, unfortunately compiled to:
add r2, r1, r0
ldrb r3, [r2, #1]
ldrb r2, [r1, +r0]
orr r3, r3, r2, lsl #8
mov r3, r3, lsl #16
mov r0, r3, lsr #16
------

Refractor suggested the following, which seems to compile slightly better:

-----
/* Big-endian 16-bit read built from two byte loads (high byte first).
   addr is evaluated twice: do not pass an expression with side effects. */
#define ReadW(addr) ((*(uint8 *)(addr) << 8) | *(uint8 *)((addr) + 1))

ldrh r0, [r1, +r0]
mov r3, r0, lsl #8
mov r3, r3, lsl #16
mov r3, r3, lsr #16
mov r2, r0, lsl #16
orr r0, r3, r2, lsr #24
------

However, what we really want is the following assembler:

-----
ldrh r0, [r1, +r0] ;load word wrong way around
mov r2, r0, lsr #8 ;swap top half into r2
mov r3, r0, lsl #24 ;swap bottom half into r3
orr r0, r2, r3, lsr #16 ;combine parts together
-----

unfortunately the compiler seems too stupid to produce this,
so I think you're going to have to put this bit straight
in assembler, or find a better compiler :-)


Btw: just storing the memory byte-swapped won't work.
Whilst 68000 word accesses are restricted to word boundaries, there
is no such restriction on byte accesses, so if you did that you'd have
to reprogram all your byte accesses to compensate instead :-)
AndrewGower
pm Member
 
Posts: 16
Joined: Oct 19, 2002 @ 10:29am


Postby schtruck » Oct 20, 2002 @ 10:06pm

schtruck
pm Member
 
Posts: 77
Joined: May 24, 2002 @ 5:29pm


PreviousNext

Return to Windows Mobile


Sort


Forum Description

A discussion forum for mobile device developers on the Windows Mobile platform. Any platform specific topics are welcome.

Moderators:

Dan East, sponge, Digby, David Horn, Kevin Gelso, RICoder

Forum permissions

You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot post attachments in this forum