; Sample DLX floating-point program for stepping through dlxsim's Tomasulo mode.
; It loads two operands, issues a short chain of dependent FP operations,
; pads with nops, and finally halts at trap #0.
;
; Data: three single-precision constants. The loads below address the first
; two by absolute displacement (256 and 260), so their placement matters.
; C is declared but not referenced by any instruction in this program.
; NOTE(review): A has no .global directive, unlike B and C -- confirm whether
; that is intentional.
A: .float 2.0
.global B
B: .float 1.0
.global C
C: .float 0.0
; NOTE(review): confirm the unit of .align in this assembler (byte count vs.
; power of two) before moving or resizing the data above.
.align 4
.global _main
_main:
; r1 and r2 are never written in this program; the displacements carry the
; whole address (presumably both registers read as 0 -- TODO confirm).
; NOTE(review): the operands are declared .float, yet ld and addd are used
; next to the single-precision multf/subf/divf -- confirm the intended widths.
ld f6, 256(r1)              ; f6 <- value at address 256 (A)
ld f2, 260(r2)              ; f2 <- value at address 260 (B)
multf f0,f2,f4              ; f0 = f2 * f4; needs f2 from the second load; f4 is never set here
subf f8,f6,f2               ; f8 = f6 - f2; needs both loaded values
divf f10,f0,f6              ; f10 = f0 / f6; needs the multf result and the first load
addd f6, f8, f2             ; overwrites f6, which the earlier divf still reads; needs the subf result
; 23 nops: padding between the FP sequence and the trap (presumably to let the
; in-flight operations make progress before the program halts -- TODO confirm).
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
trap #0                     ; end of program: the simulator stops here
Start the simulator in Tomasulo mode
(yueh) ~/dlxsim % dlxsim -TOMASULO
(dlxsim) stats hw
Memory size: 65536 bytes.
Tomasulo's Algorithm Hardware Configuration
3 add/subtract units, latency = 2 cycles
2 mult/div units, latency = 10 cycles (multiply)
latency = 40 cycles (divide)
6 load_units, latency = 2 cycles
3 store_units, latency = 2 cycles
Load sample code
(dlxsim) load ex3.s
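Use step execution. (Note: a plain `step` here does not start at `_main`, so only `nop`s are shown; the session is therefore restarted below and stepped with `step _main`.)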
(dlxsim) step
stopped after single step, pc = 0x4: nop
(dlxsim) step
stopped after single step, pc = 0x4: nop
(dlxsim) quit
(yueh) ~/dlxsim % dlxsim -TOMASULO
(dlxsim) stats hw
Memory size: 65536 bytes.
Tomasulo's Algorithm Hardware Configuration
3 add/subtract units, latency = 2 cycles
2 mult/div units, latency = 10 cycles (multiply)
latency = 40 cycles (divide)
6 load_units, latency = 2 cycles
3 store_units, latency = 2 cycles
(dlxsim) load ex3.s
(dlxsim) step _main
stopped after single step, pc = _main+0x4: ld f2,B(r2)
(dlxsim) step
stopped after single step, pc = _main+0x8: multf f0,f2,f4
(dlxsim) step
stopped after single step, pc = _main+0xc: subf f8,f6,f2
(dlxsim) step
stopped after single step, pc = _main+0x10: divf f10,f0,f6
(dlxsim) step
stopped after single step, pc = _main+0x14: addd f6,f8,f2
Print the Tomasulo status tables (instruction status, reservation stations, and register status)
(dlxsim) stats tomasulo
TOMASULO's 5 th clock cycle
Instruction Issue Execute Write Result
+============================================================================+
ld f6,A(r1) V V V
ld f2,B(r2) V V
multf f0,f2,f4 V
subf f8,f6,f2 V
divf f10,f0,f6 V
+============================================================================+
Name Busy Op Vj Vk Qj Qk
+=======================================================================+
add1 YES subf (f6) load2
add2 NO (null)
add3 NO (null)
mul1 YES multf (f4) load2
mul2 YES divf (f6) mul1
+=======================================================================+
F0 F2 F4 F6 F8 F10 F12 F14
+----------------------------------------------------------------------+
Qi mul1 load2 add1 mul2
Busy YES YES NO NO YES YES NO NO
+======================================================================+
F16 F18 F20 F22 F24 F26 F28 F30
+----------------------------------------------------------------------+
Busy NO NO NO NO NO NO NO NO
+======================================================================+
Run until the trap instruction is reached
(dlxsim) go
TRAP #0 received
Dump information
(dlxsim) stats tomasulo
TOMASULO's 54 th clock cycle
Instruction Issue Execute Write Result
+============================================================================+
ld f6,A(r1) V V V
ld f2,B(r2) V V V
multf f0,f2,f4 V V V
subf f8,f6,f2 V V V
divf f10,f0,f6 V
addd f6,f8,f2 V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
nop V V V
trap 0x0 V V V
nop V
+============================================================================+
Name Busy Op Vj Vk Qj Qk
+=======================================================================+
add1 NO (null)
add2 NO (null)
add3 NO (null)
mul1 NO (null)
mul2 NO divf (mul1) (f6)
+=======================================================================+
F0 F2 F4 F6 F8 F10 F12 F14
+----------------------------------------------------------------------+
Qi mul2
Busy NO NO NO NO NO YES NO NO
+======================================================================+
F16 F18 F20 F22 F24 F26 F28 F30
+----------------------------------------------------------------------+
Busy NO NO NO NO NO NO NO NO
+======================================================================+
Quit the program
(dlxsim) quit
Because integer operations cannot be used in this simulator, the following trick can be used to load values into registers:
ld f6, 256(r1)
ld f2, 260(r2)
Here 256 is the address of the first data item (A). Because a float occupies 4 bytes, the next value (B) is at address 260. Alternatively, use put or fput to store values, and get or fget to read them back.