Ticket #8688: blitters.s

File blitters.s, 9.0 KB (added by SF/robinwatts, 17 years ago)

First attempt at optimised blitter/scaler

@ ScummVM Scumm Interpreter
@ Copyright (C) 2007 The ScummVM project
@
@ This program is free software; you can redistribute it and/or
@ modify it under the terms of the GNU General Public License
@ as published by the Free Software Foundation; either version 2
@ of the License, or (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with this program; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
@
@ $URL: $
@ $Id: $
@
@ @author Robin Watts (robin@wss.co.uk)

        .text

        .global asmDrawStripToScreen
        .global asmCopy8Col
        .global Rescale_320x256xPAL8_To_256x256x1555
        .global Rescale_320x256x1555_To_256x256x1555

        @ ARM implementation of asmDrawStripToScreen.
        @
        @ C prototype would be:
        @
        @ extern "C" void asmDrawStripToScreen(int height,
        @                                      int width,
        @                                      byte const *text,
        @                                      byte const *src,
        @                                      byte *dst,
        @                                      int vsPitch,
        @                                      int vsScreenWidth,
        @                                      int textSurfacePitch);
        @
        @ In addition, we assume that text, src and dst are all word (4 byte)
        @ aligned. This is the same assumption that the old 'inline' version
        @ made.
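        @
        @ For reference, a rough C equivalent of what the routine below does,
        @ ignoring the word-at-a-time fast path and the width rounding (the
        @ function name and typedef are purely illustrative; 253/0xFD is the
        @ transparency value the mask below is built from):
        @
        @   typedef unsigned char byte;
        @
        @   void drawStripToScreen_C(int height, int width,
        @                            const byte *text, const byte *src,
        @                            byte *dst, int vsPitch,
        @                            int vsScreenWidth, int textSurfacePitch) {
        @       do {
        @           for (int i = 0; i < width; i++) {
        @               /* transparent text pixels show the game image */
        @               dst[i] = (text[i] == 253) ? src[i] : text[i];
        @           }
        @           text += textSurfacePitch;
        @           src  += vsPitch;
        @           dst  += vsScreenWidth;
        @       } while (--height);
        @   }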
asmDrawStripToScreen:
        @ r0 = height
        @ r1 = width
        @ r2 = text
        @ r3 = src
        MOV     r12,r13
        STMFD   r13!,{r4-r7,r9-r11,r14}
        LDMIA   r12,{r4,r5,r6,r7}
        @ r4 = dst
        @ r5 = vsPitch
        @ r6 = vsScreenWidth
        @ r7 = textSurfacePitch

        CMP     r0,#0                   @ If height<=0
        MOVLE   r0,#1                   @ height=1
        CMP     r1,#4                   @ If width<4
        BLT     end                     @ return

        @ Width &= ~4 ? What's that about then? Width &= ~3 I could have
        @ understood...
        BIC     r1,r1,#4

        SUB     r5,r5,r1                @ vsPitch -= width
        SUB     r6,r6,r1                @ vsScreenWidth -= width
        SUB     r7,r7,r1                @ textSurfacePitch -= width
        MOV     r10,#253
        ORR     r10,r10,r10,LSL #8
        ORR     r10,r10,r10,LSL #16     @ r10 = mask
yLoop:
        MOV     r14,r1                  @ r14 = width
xLoop:
        LDR     r12,[r2],#4             @ r12 = [text]
        LDR     r11,[r3],#4             @ r11 = [src]
        CMP     r12,r10
        BNE     singleByteCompare
        SUBS    r14,r14,#4
        STR     r11,[r4],#4             @ [dst] = r11
        BGT     xLoop

        ADD     r2,r2,r7                @ text += textSurfacePitch
        ADD     r3,r3,r5                @ src += vsPitch
        ADD     r4,r4,r6                @ dst += vsScreenWidth
        SUBS    r0,r0,#1
        BGT     yLoop
        LDMFD   r13!,{r4-r7,r9-r11,PC}

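        @ Slow path: this text word contains at least one non-transparent
        @ byte, so text and src are merged a byte at a time.  Each of the
        @ four steps below takes the current top byte of the text word,
        @ substitutes the corresponding src byte if the text byte is the
        @ transparency value (0xFD), and shifts the result back in at the
        @ bottom, so after four steps the word is rebuilt in its original
        @ byte order.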
singleByteCompare:
        MOV     r9,r12,LSR #24          @ r9 = 1st byte of [text]
        CMP     r9,r10,LSR #24          @ if (r9 == mask)
        MOVEQ   r9,r11,LSR #24          @     r9 = 1st byte of [src]
        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12

        MOV     r11,r11,LSL #8          @ bring up the 2nd byte of [src]
        MOV     r9,r12,LSR #24          @ r9 = 2nd byte of [text]
        CMP     r9,r10,LSR #24          @ if (r9 == mask)
        MOVEQ   r9,r11,LSR #24          @     r9 = 2nd byte of [src]
        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12

        MOV     r11,r11,LSL #8          @ bring up the 3rd byte of [src]
        MOV     r9,r12,LSR #24          @ r9 = 3rd byte of [text]
        CMP     r9,r10,LSR #24          @ if (r9 == mask)
        MOVEQ   r9,r11,LSR #24          @     r9 = 3rd byte of [src]
        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12

        MOV     r11,r11,LSL #8          @ bring up the 4th byte of [src]
        MOV     r9,r12,LSR #24          @ r9 = 4th byte of [text]
        CMP     r9,r10,LSR #24          @ if (r9 == mask)
        MOVEQ   r9,r11,LSR #24          @     r9 = 4th byte of [src]
        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12

        STR     r12,[r4],#4
        SUBS    r14,r14,#4
        BGT     xLoop

        ADD     r2,r2,r7                @ text += textSurfacePitch
        ADD     r3,r3,r5                @ src += vsPitch
        ADD     r4,r4,r6                @ dst += vsScreenWidth
        SUBS    r0,r0,#1
        BGT     yLoop
end:
        LDMFD   r13!,{r4-r7,r9-r11,PC}


        @ ARM implementation of asmCopy8Col
        @
        @ C prototype would be:
        @
        @ extern "C" void asmCopy8Col(byte *dst,
        @                             int dstPitch,
        @                             const byte *src,
        @                             int height);
        @
        @ In addition, we assume that src and dst are both word (4 byte)
        @ aligned. This is the same assumption that the old 'inline' version
        @ made.
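        @
        @ For reference, a rough C equivalent of the copy below (illustrative
        @ only; note that the ARM code steps src by dstPitch as well, and an
        @ odd height is rounded up to an even number of rows):
        @
        @   typedef unsigned char byte;
        @
        @   void copy8Col_C(byte *dst, int dstPitch, const byte *src, int height) {
        @       do {
        @           for (int i = 0; i < 8; i++)     /* one 8 byte wide row */
        @               dst[i] = src[i];
        @           dst += dstPitch;
        @           src += dstPitch;
        @       } while (--height);
        @   }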
asmCopy8Col:
        @ r0 = dst
        @ r1 = dstPitch
        @ r2 = src
        @ r3 = height
        STMFD   r13!,{r14}
        SUB     r1,r1,#4

        TST     r3,#1
        ADDNE   r3,r3,#1
        BNE     roll2
yLoop2:
        LDR     r12,[r2],#4
        LDR     r14,[r2],r1
        STR     r12,[r0],#4
        STR     r14,[r0],r1
roll2:
        LDR     r12,[r2],#4
        LDR     r14,[r2],r1
        SUBS    r3,r3,#2
        STR     r12,[r0],#4
        STR     r14,[r0],r1
        BNE     yLoop2

        LDMFD   r13!,{PC}


        @ ARM implementation of Rescale_320x256x1555_To_256x256x1555
        @
        @ C prototype would be:
        @
        @ extern "C" void Rescale_320x256x1555_To_256x256x1555(
        @                       u16 *dst,
        @                       const u16 *src,
        @                       int dstStride,
        @                       int srcStride);
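        @
        @ The scaling rule the loop below implements, written out in C for
        @ reference: every 5 source pixels become 4 destination pixels, each
        @ 5-bit channel blended with weights summing to 4, and bit 15 forced
        @ on in the result.  (blend5 is a helper invented here purely for
        @ illustration; u16 as in the prototype above.)
        @
        @   typedef unsigned short u16;
        @
        @   static u16 blend5(u16 a, u16 b, int wa, int wb) {
        @       u16 r  = (u16)((( a        & 0x1F) * wa + ( b        & 0x1F) * wb) >> 2);
        @       u16 g  = (u16)((((a >>  5) & 0x1F) * wa + ((b >>  5) & 0x1F) * wb) >> 2);
        @       u16 bl = (u16)((((a >> 10) & 0x1F) * wa + ((b >> 10) & 0x1F) * wb) >> 2);
        @       return (u16)(0x8000 | (bl << 10) | (g << 5) | r);
        @   }
        @
        @ For each of the 200 rows processed, each group of 5 source pixels
        @ s[0]..s[4] then produces:
        @
        @       dst[0] = blend5(s[0], s[1], 3, 1);
        @       dst[1] = blend5(s[1], s[2], 2, 2);
        @       dst[2] = blend5(s[2], s[3], 2, 2);
        @       dst[3] = blend5(s[3], s[4], 1, 3);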
Rescale_320x256x1555_To_256x256x1555:
        @ r0 = dst
        @ r1 = src
        @ r2 = dstStride
        @ r3 = srcStride
        STMFD   r13!,{r4-r5,r8-r11,r14}

        SUB     r2,r2,#64*5             @ srcStride -= line length
        SUB     r3,r3,#64*4             @ dstStride -= line length

        MOV     r8, #0x0000001F
        ORR     r8, r8,#0x00007C00
        ORR     r8, r8,#0x03E00000      @ r8 = mask
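        @ (Duplicating a 1555 pixel into both halves of a word and ANDing
        @ with this mask leaves the three 5-bit channels separated by zero
        @ bits, so whole pixels can be added and weighted below without one
        @ channel carrying over into the next.)
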
        MOV     r5, #200                @ r5 = y
yLoop3:
        MOV     r4, #64                 @ r4 = x
xLoop3:
        LDRH    r9, [r0],#2             @ r9 = src0
        LDRH    r10,[r0],#2             @ r10= src1
        LDRH    r11,[r0],#2             @ r11= src2
        LDRH    r12,[r0],#2             @ r12= src3
        LDRH    r14,[r0],#2             @ r14= src4

        ORR     r9, r9, r9, LSL #16     @ r9 = src0 | src0
        ORR     r10,r10,r10,LSL #16     @ r10= src1 | src1
        ORR     r11,r11,r11,LSL #16     @ r11= src2 | src2
        ORR     r12,r12,r12,LSL #16     @ r12= src3 | src3
        ORR     r14,r14,r14,LSL #16     @ r14= src4 | src4

        AND     r9, r9, r8              @ r9 = 0 | G0 | 0 | B0 | 0 | R0
        AND     r10,r10,r8              @ r10= 0 | G1 | 0 | B1 | 0 | R1
        AND     r11,r11,r8              @ r11= 0 | G2 | 0 | B2 | 0 | R2
        AND     r12,r12,r8              @ r12= 0 | G3 | 0 | B3 | 0 | R3
        AND     r14,r14,r8              @ r14= 0 | G4 | 0 | B4 | 0 | R4

        ADD     r9, r9, r9, LSL #1      @ r9 = 3*src0
        ADD     r9, r9, r10             @ r9 = dst0<<2
        ADD     r10,r10,r11             @ r10= dst1<<1
        ADD     r11,r11,r12             @ r11= dst2<<1
        ADD     r12,r12,r14             @ r12= src3 + src4
        ADD     r12,r12,r14,LSL #1      @ r12= src3 + src4*3 = dst3<<2

        AND     r9, r8, r9, LSR #2      @ r9 = dst0 (split)
        AND     r10,r8, r10,LSR #1      @ r10= dst1 (split)
        AND     r11,r8, r11,LSR #1      @ r11= dst2 (split)
        AND     r12,r8, r12,LSR #2      @ r12= dst3 (split)

        ORR     r9, r9, r9, LSR #16     @ r9 = dst0
        ORR     r10,r10,r10,LSR #16     @ r10= dst1
        ORR     r11,r11,r11,LSR #16     @ r11= dst2
        ORR     r12,r12,r12,LSR #16     @ r12= dst3

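        @ Set bit 15 of each output pixel (the leading '1' of 1555; presumably
        @ the opaque/alpha bit of the target screen format).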
        ORR     r9, r9, #0x8000
        ORR     r10,r10,#0x8000
        ORR     r11,r11,#0x8000
        ORR     r12,r12,#0x8000

        STRH    r9, [r1],#2
        STRH    r10,[r1],#2
        STRH    r11,[r1],#2
        STRH    r12,[r1],#2

        SUBS    r4,r4,#1
        BGT     xLoop3

        ADD     r0,r0,r2,LSL #1
        ADD     r1,r1,r3,LSL #1
        SUBS    r5,r5,#1
        BGT     yLoop3

        LDMFD   r13!,{r4-r5,r8-r11,PC}

        @ ARM implementation of Rescale_320x256xPAL8_To_256x256x1555
        @
        @ C prototype would be:
        @
        @ extern "C" void Rescale_320x256xPAL8_To_256x256x1555(
        @                       u16 *dst,
        @                       const u8 *src,
        @                       int dstStride,
        @                       int srcStride,
        @                       const u16 *pal);
        @
        @ This is a slight reordering of the params from the existing C one.
        @ Sorry, but it makes the code easier.
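        @
        @ For reference, a rough C outline of this routine (illustrative only;
        @ u8/u16 as in the prototype above, u32 assumed to be an unsigned
        @ 32-bit type, 'split' an invented name): the palette is first
        @ expanded into a 256-entry table of channel-separated values, and
        @ every group of 5 source bytes is then looked up and blended with
        @ the same 3:1, 1:1, 1:1, 1:3 weights as the 1555 routine above.
        @
        @   u32 split[256];                 /* built on the stack below (1K) */
        @   for (int i = 0; i < 256; i++) {
        @       u32 p = pal[i];
        @       split[i] = (p | (p << 16)) & 0x03E07C1F;
        @   }
        @
        @ The first output pixel of a group of source bytes s0..s4 is then:
        @
        @   u32 t = (3 * split[s0] + split[s1]) >> 2;   /* weights 3:1  */
        @   t &= 0x03E07C1F;                            /* re-isolate   */
        @   u16 d0 = (u16)(0x8000 | t | (t >> 16));     /* fold halves  */
        @
        @ and the remaining three outputs blend s1:s2, s2:s3 and s3:s4 with
        @ weights 1:1, 1:1 and 1:3.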
Rescale_320x256xPAL8_To_256x256x1555:
        @ r0 = dst
        @ r1 = src
        @ r2 = dstStride
        @ r3 = srcStride
        STMFD   r13!,{r4-r5,r8-r11,r14}
        MOV     r8, #0x0000001F
        ORR     r8, r8,#0x00007C00
        ORR     r8, r8,#0x03E00000      @ r8 = mask
        LDR     r9, [r13,#7*4]          @ r9 = palette

        SUB     r13,r13,#256*4          @ r13 = 1K of space on the stack.
        MOV     r5, r13                 @ r5 points to this space
        MOV     r14,#256
palLoop:
        LDRH    r10,[r9],#2             @ r10 = palette entry
        SUBS    r14,r14,#1
        ORR     r10,r10,r10,LSL #16
        AND     r10,r10,r8              @ r10 = separated palette entry
        STR     r10,[r5],#4
        BGT     palLoop

        SUB     r2,r2,#64*5             @ srcStride -= line length
        SUB     r3,r3,#64*4             @ dstStride -= line length

        MOV     r5,#200                 @ r5 = y
yLoop4:
        MOV     r4,#64                  @ r4 = x
xLoop4:
        LDRB    r9, [r0],#1             @ r9 = src0
        LDRB    r10,[r0],#1             @ r10= src1
        LDRB    r11,[r0],#1             @ r11= src2
        LDRB    r12,[r0],#1             @ r12= src3
        LDRB    r14,[r0],#1             @ r14= src4

        LDR     r9, [r13,r9, LSL #2]    @ r9 = pal[src0]
        LDR     r10,[r13,r10,LSL #2]    @ r10= pal[src1]
        LDR     r11,[r13,r11,LSL #2]    @ r11= pal[src2]
        LDR     r12,[r13,r12,LSL #2]    @ r12= pal[src3]
        LDR     r14,[r13,r14,LSL #2]    @ r14= pal[src4]

        ADD     r9, r9, r9, LSL #1      @ r9 = 3*src0
        ADD     r9, r9, r10             @ r9 = dst0<<2
        ADD     r10,r10,r11             @ r10= dst1<<1
        ADD     r11,r11,r12             @ r11= dst2<<1
        ADD     r12,r12,r14             @ r12= src3 + src4
        ADD     r12,r12,r14,LSL #1      @ r12= src3 + src4*3 = dst3<<2

        AND     r9, r8, r9, LSR #2      @ r9 = dst0 (split)
        AND     r10,r8, r10,LSR #1      @ r10= dst1 (split)
        AND     r11,r8, r11,LSR #1      @ r11= dst2 (split)
        AND     r12,r8, r12,LSR #2      @ r12= dst3 (split)

        ORR     r9, r9, r9, LSR #16     @ r9 = dst0
        ORR     r10,r10,r10,LSR #16     @ r10= dst1
        ORR     r11,r11,r11,LSR #16     @ r11= dst2
        ORR     r12,r12,r12,LSR #16     @ r12= dst3

        ORR     r9, r9, #0x8000
        ORR     r10,r10,#0x8000
        ORR     r11,r11,#0x8000
        ORR     r12,r12,#0x8000

        STRH    r9, [r1],#2
        STRH    r10,[r1],#2
        STRH    r11,[r1],#2
        STRH    r12,[r1],#2

        SUBS    r4,r4,#1
        BGT     xLoop4

        ADD     r0,r0,r2
        ADD     r1,r1,r3,LSL #1
        SUBS    r5,r5,#1
        BGT     yLoop4

        ADD     r13,r13,#256*4

        LDMFD   r13!,{r4-r5,r8-r11,PC}