汇编代码片段 - vfdff的博客

作者在 2012-08-05 10:56:14 发布以下内容

# xt-xcc::8.0.2

 #-----------------------------------------------------------
 # Compiling F4.c (/tmp/cc0M#4b60948d.ijEhEV)
 #-----------------------------------------------------------

 #-----------------------------------------------------------
 # Options:
 #-----------------------------------------------------------
 # Target:xtensa, ISA:xtensa, Pointer Size:32
 # -O2 (Optimization level)
 # -g0 (Debug level)
 # -m2 (Report advisories)
 #-----------------------------------------------------------

 # Source-file bookkeeping emitted by the compiler: the translation unit
 # name, and the path registered as file number 1.
 .file "F4.c"
 .file 1 "/../benchmark/huaweibench/8.case/F4.c"

 .text
 .align 8

 #-----------------------------------------------------------
 # Uninitialised global objects, placed in .bss
 # (section flags "wa" = writable + allocatable).
 #
 # Every object follows the same pattern:
 #   .org    <offset>   fix the object's offset inside the section
 #   .align  16         16-byte alignment (GNU as treats the .align
 #                      operand as a byte count on Xtensa)
 #   .global / .type @object / .size   export and describe the symbol
 #   <label>:  .skip <size>            reserve <size> bytes
 #
 # Layout (offset / size in bytes), as given by the directives below:
 #   C00000684   0x00000        4
 #   C0000067F   0x00010   153600
 #   C00000686   0x25810      400
 #   C00000680   0x259a0    76800
 #   C00000681   0x385a0     9600
 #   C00000682   0x3ab20      200
 #   C00000683   0x3abf0     1024
 #   C00000685   0x3aff0    12800
 #
 # The .org values are hard-coded: each one equals the previous offset
 # plus the previous size, rounded up to a multiple of 16. Changing any
 # size therefore requires updating every .org that follows it.
 # The symbol names are opaque identifiers; this listing does not show
 # the C-level names or element types behind them.
 #-----------------------------------------------------------
 .section .bss, "wa"
 .org 0x0
 .align 16
 .global C00000684
 .type C00000684, @object
 .size C00000684, 4
C00000684: # 0x0
 .skip 4
 # 4 bytes used, padded up to the next 16-byte boundary (0x10).
 .org 0x10
 .align 16
 .global C0000067F
 .type C0000067F, @object
 .size C0000067F, 153600
C0000067F: # 0x10
 .skip 153600
 # 0x10 + 153600 (0x25800) = 0x25810
 .org 0x25810
 .align 16
 .global C00000686
 .type C00000686, @object
 .size C00000686, 400
C00000686: # 0x25810
 .skip 400
 # 0x25810 + 400 (0x190) = 0x259a0
 .org 0x259a0
 .align 16
 .global C00000680
 .type C00000680, @object
 .size C00000680, 76800
C00000680: # 0x259a0
 .skip 76800
 # 0x259a0 + 76800 (0x12c00) = 0x385a0
 .org 0x385a0
 .align 16
 .global C00000681
 .type C00000681, @object
 .size C00000681, 9600
C00000681: # 0x385a0
 .skip 9600
 # 0x385a0 + 9600 (0x2580) = 0x3ab20
 .org 0x3ab20
 .align 16
 .global C00000682
 .type C00000682, @object
 .size C00000682, 200
C00000682: # 0x3ab20
 .skip 200
 # 0x3ab20 + 200 = 0x3abe8, rounded up to 0x3abf0 for 16-byte alignment.
 .org 0x3abf0
 .align 16
 .global C00000683
 .type C00000683, @object
 .size C00000683, 1024
C00000683: # 0x3abf0
 .skip 1024
 # 0x3abf0 + 1024 (0x400) = 0x3aff0
 .org 0x3aff0
 .align 16
 .global C00000685
 .type C00000685, @object
 .size C00000685, 12800
C00000685: # 0x3aff0
 .skip 12800

 # Code section. .literal_position marks the spot where the literal pool
 # is emitted; .LC0_1_32 is a 32-bit literal with the value 4205 that the
 # loop body fetches with l32r.
 .text
 .literal_position
 .literal .LC0_1_32, 4205

 # Program Unit: TestCode
 #
 # TestCode -- compiler-generated code (xt-xcc, -O2) for a function in F4.c.
 # The C prototype is not part of this listing; the roles below are read
 # off the visible instructions only.
 #
 # ABI:    windowed-register calling convention (function starts with
 #         "entry"), 176-byte stack frame addressed through a1.
 # Inputs (registers read before they are written):
 #   a2  base address; copied into a4 and dereferenced in the loop
 #   a3  address of a 32-bit word that the loop loads, adjusts and
 #       stores back
 #   a4  count; zero skips the loop entirely, otherwise it is scaled
 #       by 256 and added to a2
 #   a5  mode value; the loop compares it with 2 and copies it to a11
 # Output: not visible -- the listing ends before any return.
 # Vector registers v0..v15 belong to a TIE extension (the spill slot
 # name mentions "HiDSP170"); their instructions are not defined in this
 # file, so vector-op descriptions are best-effort and should be
 # confirmed against the TIE documentation.
 #
 # Notation used by the compiler throughout:
 #   { ... }   one multi-slot instruction bundle; "format h64"/"x64"
 #             names the bundle format
 #   [n]       presumably the scheduled cycle inside the basic block
 #             (the listing does not define it -- TODO confirm)
 .type TestCode, @function
 .align 4
 .global TestCode
TestCode: # 0x4
 # Frame offsets (bytes from a1) of the locals and spill slots:
 # uwArrVar01 = 64
 # wArrVar02 = 0
 # uwArrVar03 = 80
 # rtom_spill__TIE_HiDSP170_vec8x16_temp_0 = 112
 # gra_spill_temp_1 = 128
 # gra_spill_temp_2 = 132
.LBB1_TestCode: # 0x4
#<freq> BB:1 => BB:16 probability = 0.02929
#<freq> BB:1 => BB:3 probability = 0.97071
 .frequency 1.000 0.000
 # Allocate the 176-byte frame, then bail out when the count in a4 is 0.
 # The target .Lt_0_4866 is not defined in the visible part of the
 # listing (presumably the function exit).
 entry a1,176 #
 beqz a4,.Lt_0_4866 # [1]

#.LBB3_TestCode: # 0xa
 # Loop preheader: set up loop-invariant values.
 # a9  = -a2            (negated base)
 # a10 = a4 << 8        (count * 256)
{ # format h64
 neg a9,a2 # [0]
 nop #
 slli a10,a4,8 # [0]
}
 # Scalar constants 16 and 1, moved into vector registers v0 and v1
 # below; the loop uses v0 and v1 as shift operands.
{ # format h64
 movi a12,16 # [1]
 nop #
 movi a13,1 # [1]
}
{ # format x64
 movvr40 v1,a13 # [2]
 nop #
 nop #
}
{ # format x64
 movvr40 v0,a12 # [3]
 nop #
 nop #
}
 # a10 = a2 + count * 256; a4 is re-purposed as a copy of the base
 # address a2 and is the register the loop dereferences.
{ # format h64
 add a10,a10,a2 # [4]
 nop #
 mov.n a4,a2 # [4]
}
 # Spill both invariants to the frame: -base at offset 132 (reloaded in
 # the loop body) and base + count * 256 at offset 128 (its reload is
 # not in the visible listing -- presumably the loop end address).
{ # format h64
 s32i a9,a1,132 # [5] gra_spill_temp_2
 nop #
 s32i a10,a1,128 # [5] gra_spill_temp_1
}

.Lt_0_5378: # 0x3a
#<loop> Loop body line 76, nesting depth: 1, estimated iterations: 100
#<swpf> non-innermost loop
 .frequency 0.971 47.808
 # Head of the outer loop. Register state on entry, as established by
 # the preheader: a4 = current source address, a3 = address of a 32-bit
 # word, a5 = mode value, v0 = 16, v1 = 1, frame+132 = negated base.
 # Vector (v*) instructions are TIE operations that this listing does
 # not define; the remarks on them are assumptions to be confirmed.
 #
 # a14 = word at [a3]
 l32i.n a14,a3,0 # [0] id:174
 # Gather four 32-bit words from the source at byte offsets 0, 20, 40
 # and 60 (a 20-byte stride) ...
{ # format h64
 l32i a13,a4,0 # [1] id:166
 nop #
 l32i a12,a4,20 # [1] id:168
}
{ # format h64
 l32i a11,a4,40 # [2] id:170
 nop #
 l32i a10,a4,60 # [2] id:172
}
 # ... and pack them contiguously into uwArrVar01[0..3] (frame+64..79)
 # so they can be loaded as one vector.
{ # format h64
 s32i a10,a1,76 # [3] uwArrVar01+12
 nop #
 s32i a11,a1,72 # [3] uwArrVar01+8
}
{ # format h64
 s32i a12,a1,68 # [4] uwArrVar01+4
 nop #
 s32i a13,a1,64 # [4] uwArrVar01
}
 # v5 = vector load of uwArrVar01; a9 = a14 - 12.
{ # format h64
 lvs32.i v5,a1,64 # [5] uwArrVar01
 nop #
 addi a9,a14,-12 # [5]
}
 # v4 = (a14 - 12) as a vector operand; v5 = v5 shifted by v0 (16).
 # Going by the mnemonics, sshvr/sshvl are presumably saturating
 # right/left shifts on 4 x 40-bit lanes -- TODO confirm.
{ # format x64
 movvr40 v4,a9 # [6]
 nop #
 sshvr.sat40.4x40 v5,v5,v0 # [6]
}
 # v4 = v1 (1) shifted left by v4, i.e. presumably 1 << (a14 - 12).
{ # format x64
 nop #
 nop #
 sshvl.sat40.4x40 v4,v1,v4 # [7]
}
 # v4 = v4 + v5
{ # format x64
 nop #
 nop #
 add40 v4,v4,v5 # [8]
}
 # Write the vector result back over uwArrVar01 ...
{ # format x64
 svs32.i v4,a1,64 # [10] uwArrVar01
 nop #
 nop #
}
 # ... and read its four 32-bit words back into scalar registers.
{ # format h64
 l32i a15,a1,68 # [11] uwArrVar01+4
 nop #
 l32i a12,a1,64 # [11] uwArrVar01
}
{ # format h64
 l32i a8,a1,76 # [12] uwArrVar01+12
 nop #
 l32i a13,a1,72 # [12] uwArrVar01+8
}
 # a9 = negated base (reloaded spill); a11 = a5. a11 is the flag later
 # tested at .L_0_8706.
{ # format h64
 l32i a9,a1,132 # [14] gra_spill_temp_2
 nop #
 mov.n a11,a5 # [14]
}
 # OR the four words together (finished in the next bundle) so that one
 # leading-zero count covers all of them.
{ # format h64
 or a13,a13,a8 # [15]
 nop #
 or a12,a12,a15 # [15]
}
 # a10 = frame - 16: destination pointer for wArrVar02, biased by -16
 # because the stores that use it carry a +16 update operand.
{ # format h64
 addi a10,a1,-16 # [16] wArrVar02-16
 nop #
 or a12,a12,a13 # [16]
}
 # a12 = nsau(a12): normalization shift amount (leading-zero count) of
 # the OR-ed words. a9 = a4 - base.
{ # format h64
 nsau a12,a12 # [17]
 nop #
 add a9,a4,a9 # [17]
}
 # a12 = nsau - 17 and a14 += a12 in the same bundle.
 # NOTE(review): a12 is written and read by two slots of this bundle;
 # which value the add sees depends on the bundle's read/write ordering
 # -- confirm against the ISA before relying on the a14 result.
{ # format h64
 addi a12,a12,-17 # [18]
 nop #
 add a14,a14,a12 # [18]
}
 # v2 = a12 (nsau - 17), the shift operand for the copy that follows;
 # v10 = v4 shifted left by v0 (16).
{ # format x64
 movvr20 v2,a12 # [19]
 nop #
 sshvl.sat40.4x40 v10,v4,v0 # [19]
}
 # a14 -= 17; a9 = (a4 - base) + a2.
{ # format h64
 addi a14,a14,-17 # [20]
 nop #
 add a9,a9,a2 # [20]
}
 # Store the adjusted a14 back to [a3]; bias the source pointer a9 by
 # -16 to match the +16 update operand of the loads that use it.
{ # format h64
 s32i a14,a3,0 # [21] id:178
 nop #
 addi a9,a9,-16 # [21]
}

#.LBB21_TestCode: # 0xd5
#<loop> Part of loop body line 76, head labeled .Lt_0_5378
#<loop> unrolled 4 times (fully)
 # Fully unrolled inner loop: four identical load / shift / store steps.
 # On entry a9 = source pointer - 16, a10 = wArrVar02 - 16 (frame - 16)
 # and v2 = shift operand, all set at the end of the preceding block.
 # The lvs16/svs16/sshvl opcodes are TIE operations not defined in this
 # listing. The ".iu" forms with operand 16 presumably advance the
 # address register by 16 bytes per access, which would explain the -16
 # bias on both pointers -- TODO confirm. Under that reading the four
 # steps cover 64 bytes of source and fill wArrVar02[0..63].
 #
 # Step 1: v9 = load; v9 = v9 shifted left by v2; store to wArrVar02.
{ # format x64
 lvs16.iu v9,a9,16 # [0] id:180
 nop #
 nop #
}
{ # format x64
 nop #
 nop #
 sshvl.sat20.8x20 v9,v9,v2 # [1]
}
{ # format x64
 svs16.iu v9,a10,16 # [3] id:181 wArrVar02+0x0
 nop #
 nop #
}
 # Step 2 (same operation, register v8).
{ # format x64
 lvs16.iu v8,a9,16 # [4] id:180
 nop #
 nop #
}
{ # format x64
 nop #
 nop #
 sshvl.sat20.8x20 v8,v8,v2 # [5]
}
{ # format x64
 svs16.iu v8,a10,16 # [7] id:181 wArrVar02+0x0
 nop #
 nop #
}
 # Step 3 (register v7).
{ # format x64
 lvs16.iu v7,a9,16 # [8] id:180
 nop #
 nop #
}
{ # format x64
 nop #
 nop #
 sshvl.sat20.8x20 v7,v7,v2 # [9]
}
{ # format x64
 svs16.iu v7,a10,16 # [11] id:181 wArrVar02+0x0
 nop #
 nop #
}
 # Step 4 (register v6).
{ # format x64
 lvs16.iu v6,a9,16 # [12] id:180
 nop #
 nop #
}
{ # format x64
 nop #
 nop #
 sshvl.sat20.8x20 v6,v6,v2 # [13]
}
{ # format x64
 svs16.iu v6,a10,16 # [15] id:181 wArrVar02+0x0
 nop #
 nop #
}

#.LBB19_TestCode: # 0x135
#<loop> Part of loop body line 76, head labeled .Lt_0_5378
#<freq> BB:19 => BB:10 probability = 0.86517
#<freq> BB:19 => BB:8 probability = 0.13483
 # Overwrite uwArrVar01 (frame+64) with v10, which the loop head
 # computed as v4 shifted left by v0.
{ # format x64
 svs32.i v10,a1,64 # [0] uwArrVar01
 nop #
 nop #
}
 # Only a5 == 2 falls through to the threshold test. Any other a5 jumps
 # straight to .L_0_8706 with a11 still holding the copy of a5 made in
 # the loop head.
 bnei a5,2,.L_0_8706 # [1]

#.LBB8_TestCode: # 0x140
#<loop> Part of loop body line 76, head labeled .Lt_0_5378
#<freq> BB:8 => BB:9 probability = 0.50000
#<freq> BB:8 => BB:10 probability = 0.50000
 # Reached only when a5 == 2. Builds two magnitude sums and compares
 # them through a scaled threshold to decide the flag a11:
 #   a11 = 0   if ((4205 * a8) >> 11) <  a15   (branch taken)
 #   a11 = a5  otherwise                       (fall-through block)
 # where a8 is derived from selected words of wArrVar02 and a15 from
 # uwArrVar01. The abs20/add20/radd20/sshvr/movar16 opcodes are TIE
 # operations not defined in this listing; the notes on them follow the
 # mnemonics and should be confirmed.
 #
 # Gather six 32-bit words of wArrVar02 (byte offsets 16, 32, 36, 48,
 # 52, 56) into uwArrVar03 (frame+80..103) and zero the last two words
 # (frame+104, frame+108):
 #   frame+80  <- wArrVar02+16     frame+92  <- wArrVar02+48
 #   frame+84  <- wArrVar02+32     frame+96  <- wArrVar02+52
 #   frame+88  <- wArrVar02+36     frame+100 <- wArrVar02+56
{ # format h64
 l32i a10,a1,48 # [0] wArrVar02+48
 nop #
 l32i a12,a1,56 # [0] wArrVar02+56
}
 # a11 = 0 serves both as the zero fill value and as the flag value
 # kept when the final branch is taken.
{ # format h64
 movi a11,0 # [1]
 nop #
 l32i a13,a1,52 # [1] wArrVar02+52
}
{ # format h64
 l32i a14,a1,36 # [2] wArrVar02+36
 nop #
 l32i a8,a1,16 # [2] wArrVar02+16
}
{ # format h64
 s32i a8,a1,80 # [3] uwArrVar03
 nop #
 l32i a15,a1,32 # [3] wArrVar02+32
}
{ # format h64
 s32i a15,a1,84 # [4] uwArrVar03+4
 nop #
 s32i a14,a1,88 # [4] uwArrVar03+8
}
{ # format h64
 s32i a11,a1,104 # [5] uwArrVar03+24
 nop #
 s32i a11,a1,108 # [5] uwArrVar03+28
}
{ # format h64
 s32i a13,a1,96 # [6] uwArrVar03+16
 nop #
 s32i a12,a1,100 # [6] uwArrVar03+20
}
 # Load the two halves of uwArrVar03 as vectors (v13 from frame+96,
 # v12 from frame+80) while the last scalar store completes.
{ # format h64
 lvs16.i v13,a1,96 # [7] uwArrVar03+16
 nop #
 s32i a10,a1,92 # [7] uwArrVar03+12
}
 # v13 = abs(v13), v12 = abs(v12) (presumably lane-wise absolute value).
{ # format x64
 lvs16.i v12,a1,80 # [8] uwArrVar03
 nop #
 abs20 v13,v13 # [8]
}
{ # format x64
 nop #
 nop #
 abs20 v12,v12 # [9]
}
 # v12 = v12 + v13; v11 = vector load of uwArrVar01 (frame+64), which
 # at this point holds the v10 vector stored just before the a5 test.
{ # format x64
 lvs16.i v11,a1,64 # [10] uwArrVar01
 nop #
 add20 v12,v12,v13 # [10]
}
 # a9 = 4 (shift amount); v11 = abs(v11).
{ # format x64
 movi a9,4 # [11]
 nop #
 abs20 v11,v11 # [11]
}
 # v13 = 4 as a vector operand; radd20 presumably reduces (sums) the
 # lanes of v12 -- TODO confirm.
{ # format x64
 movvr20 v13,a9 # [12]
 nop #
 radd20 v12,v12 # [12]
}
 # v12 = v12 shifted right by v13 (4).
{ # format x64
 nop #
 nop #
 sshvr.sat20.8x20 v12,v12,v13 # [13]
}
 # a14 = 4205 (literal .LC0_1_32); same reduction applied to v11.
{ # format x64
 l32r a14,.LC0_1_32 # [14]
 nop #
 radd20 v11,v11 # [14]
}
 # a8 = scalar taken from v12; v11 = v11 shifted right by v13 (4).
{ # format x64
 movar16 a8,v12 # [15]
 nop #
 sshvr.sat20.8x20 v11,v11,v13 # [15]
}
 # a15 = scalar taken from v11.
{ # format x64
 movar16 a15,v11 # [17]
 nop #
 nop #
}
 # a14 = (4205 * a8) >> 11. mul16s is a signed 16 x 16-bit multiply of
 # the low halves; srai is an arithmetic shift. 4205 / 2048 is roughly
 # 2.05, so a8 is scaled by about 2.05 before the compare.
 mul16s a14,a14,a8 # [18]
 srai a14,a14,11 # [20]
 # Signed compare: when the scaled a8 is below a15, jump with a11 = 0.
 blt a14,a15,.L_0_8706 # [21]

#.LBB9_TestCode: # 0x1d1
#<loop> Part of loop body line 76, head labeled .Lt_0_5378
 # Scaled a8 >= a15: restore the flag to a5 (which is 2 on this path)
 # and fall through to .L_0_8706.
 mov.n a11,a5 #

.L_0_8706: # 0x1d4
.Lt_0_6658: # 0x1d4
#<loop> Part of loop body line 76, head labeled .Lt_0_5378
#<freq> BB:10 => BB:11 probability = 0.13483
#<freq> BB:10 => BB:12 probability = 0.86517
 .frequency 3.288 45.491
 # Join point for the three paths that set the flag a11:
 #   a5 != 2                      -> a11 = a5
 #   a5 == 2, threshold branch    -> a11 = 0
 #   a5 == 2, fall-through        -> a11 = a5
 # A non-zero flag jumps to .Lt_0_7170, which is not defined in the
 # visible part of the listing.
 bnez a11,.Lt_0_7170 #

#.LBB11_TestCode: # 0x1d7
#<loop> Part of loop body line 76, head labeled .Lt_0_5378
 # Flag is zero: build a vector from scalar 0 in v15, write it to the
 # TIE spill slot at frame+112 and read it back into v14. movvr20,
 # svs16.i and lvs16.i are TIE operations not defined in this listing.
 movi.n a9,0 # [0]
{ # format x64
 movvr20 v15,a9 # [1]
 nop #
 nop #
}
{ # format x64
 svs16.i v15,a1,112 # [3] rtom_spill__TIE_HiDSP170_vec8x16_temp_0
 nop #
 nop #
}
{ # format x64
 lvs16.i v14,a1,112 # [4] rtom_spill__TIE_HiDSP170_vec8x16_temp_0
 nop #
 # NOTE(review): the published listing stops here, inside an open bundle
 # (no closing brace). The rest of this block, the loop back-edge and
 # the function return are missing, so this fragment cannot be assembled.

实例 | 阅读 6736 次

文章评论，共1条

vfdff(作者)

2012-08-05 11:02

Audio DSP, Baseband DSP, or Your Own Customized Value-Add Engine – Tensilica’s IP Cores Excel in the SOC Dataplane 
Tensilica is the #1 supplier of audio DSP IP cores and 4G baseband DSP IP cores for the mobile, handset, and home entertainment markets. 
 
In fact, no matter what the function is, if your SOC design demands a highly-efficient, programmable computational engine for a data-intensive task, our innovative technology can provide a solution for you. 
 
For the most common and broadly applicable tasks in the dataplane, Tensilica has ready made solutions like our HiFi Audio DSPs, our ConnX Communications DSPs and our Diamond Standard controllers for deeply embedded dataplane control. 
 
For more specialized tasks, you can rapidly build your own customized dataplane processor for tasks like image signal processing, video processing, security protocol processing, or network packet processing using our Xtensa Processor Generator. 
 
Why do Tensilica’s customers keep coming back for more? Two main reasons: 
 
1. They like our comprehensive, total solutions for audio and baseband DSPs.
2. They’ve discovered that it’s easy to develop their own programmable cores with Tensilica’s customized processors that provide much higher performance with lower area and lower power. This is essential for tasks in the SOC dataplane and is why we call our processor cores DPUs (dataplane processing units).
 
 
DPUs Do the Hard Stuff – the Data Processing that the Control Processor Can’t Do 
 
Where Tensilica’s cores really shine is in the dataplane - the "other" part of the chip where typically designers use RTL blocks to do the "heavy lifting". The problem with those RTL blocks is that they take a long time to design, take even longer to verify, and are not programmable for making changes post-silicon. 
 
DPUs combine the best of CPUs and DSPs with 10-to-100x the performance. DPUs can do BOTH performance-intensive DSP (audio, video, imaging, and baseband signal processing) and embedded RISC CPU processing functions (security, networking, and deeply embedded control).
 
#1 in Audio DSP IP Cores 
Tensilica offers the best audio DSP IP cores in the industry. With a full range of audio codecs plus sound-enhancement software from industry leaders AM3D, Dolby, DTS, QSound, and SRS, Tensilica’s HiFi Audio is becoming the de facto standard for high-quality audio. In home entertainment, we were the first IP provider certified for DTS Master Audio and we offer outstanding support for Dolby codecs and HD Radio. Because they require so little power, Tensilica’s HiFi Audio DSPs have brought home entertainment quality to smartphones, digital cameras and other portable electronic devices. 
 
 
 
#1 in LTE DSP IP Cores 
Why have so many companies picked Tensilica’s DSPs for their 4G LTE designs? With the widest range of DSP IP cores, Tensilica can provide the performance level you need for your design, from our basic ConnX D2 dual-MAC DSP all the way up to our newest ConnX BBE64, a 64 – 128 MAC DSP designed specifically to meet the throughput challenges of LTE Advanced communications. Our Atlas reference architecture implements the entire 3GPP LTE layer 1 PHY, including the computationally demanding Turbo decoder, in a completely processor-based, fully programmable DSP core reference architecture. 
 
Xtensa Customizable Processors 
You can get the best performance, lowest power and smallest size by customizing a processor for your exact application. There are three fundamental ways you can optimize Xtensa processors: 
 
1. You can stream data into and out of the processor directly, without going through the processor bus. This means no load/store overhead and RTL-like performance.
2. You can use our check-box configuration options to pick just what you need in terms of memories, interfaces, and more.
3. You can add custom instructions that merge several operations into one, do something unique for your product, and allow for fast parallel execution.
Diamond Standard Controllers 
Tensilica offers a wide range of standard controller options from very small, 32-bit ultra-low-power, cache-less RISC controller up to a powerful high-performance 3-issue VLIW CPU. 
 
Backed by the Best Tools in the Industry 
Whether you're a hardware designer or a software developer, an inexperienced engineer or an experienced Tensilica user, Tensilica has a comprehensive set of tools to make your job much easier and more productive. Our tools that help you design a custom processor are unrivalled in our industry. 
 
Every processor you get from Tensilica, be it a standard DSP or a processor you customized yourself, comes with an automatically generated, matching software tool chain that includes an outstanding C/C++ compiler, ISS, debugger, and code generation and analysis tools that will speed the software development process. 
 
Base Architecture and Cool Technology 
In this section, you'll read about the architecture behind all of these products - Tensilica's efficient, low-power Xtensa architecture. You also can read about the complete software tool chain, third party ecosystem, EDA flows, and models available with every Tensilica processor.

赞回复

vfdff的博客

vfdff

浏览2030355次