From patchwork Fri Apr 17 15:02:11 2015
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Andrew Senkevich <andrew.n.senkevich@gmail.com>
X-Patchwork-Id: 6300
Received: (qmail 98634 invoked by alias); 17 Apr 2015 15:02:48 -0000
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <libc-alpha.sourceware.org>
List-Unsubscribe: <mailto:libc-alpha-unsubscribe-##L=##H@sourceware.org>
List-Subscribe: <mailto:libc-alpha-subscribe@sourceware.org>
List-Archive: <http://sourceware.org/ml/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-help@sourceware.org>,
	<http://sourceware.org/ml/#faqs>
Sender: libc-alpha-owner@sourceware.org
Delivered-To: mailing list libc-alpha@sourceware.org
Received: (qmail 98532 invoked by uid 89); 17 Apr 2015 15:02:47 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-1.3 required=5.0 tests=AWL, BAYES_00,
	FREEMAIL_FROM, RCVD_IN_DNSWL_LOW,
	SPF_PASS autolearn=ham version=3.3.2
X-HELO: mail-lb0-f175.google.com
X-Received: by 10.152.44.225 with SMTP id h1mr4423977lam.5.1429282962219;
	Fri, 17 Apr 2015 08:02:42 -0700 (PDT)
MIME-Version: 1.0
In-Reply-To: <alpine.DEB.2.10.1412111756090.21111@digraph.polyomino.org.uk>
References: 
 <CAMXFM3sG=Uo6pJsNn34VtU+VDd4+VO+YyoUw3hCYpFi_K1cLsg@mail.gmail.com>
	<alpine.DEB.2.10.1412031814130.20515@digraph.polyomino.org.uk>
	<CAMXFM3sTwm+pFdALMg4aaBroFbDds=LK1rTH4ksouR=MK-75Wg@mail.gmail.com>
	<alpine.DEB.2.10.1412111756090.21111@digraph.polyomino.org.uk>
From: Andrew Senkevich <andrew.n.senkevich@gmail.com>
Date: Fri, 17 Apr 2015 18:02:11 +0300
Message-ID: 
 <CAMXFM3tOpEoMb-oLoeSJNzOWYnJgte3bm_OR=oZ172n845WeZw@mail.gmail.com>
Subject: Re: [PATCH 7/N v2] [x86_64] Vectorized math functions
To: Joseph Myers <joseph@codesourcery.com>
Cc: libc-alpha <libc-alpha@sourceware.org>

> 2014-12-11 20:57 GMT+03:00 Joseph Myers <joseph@codesourcery.com>:
> On Tue, 9 Dec 2014, Andrew Senkevich wrote:
>
>> Combined and attached, it seems not passed being sent in plain text.
>>
>> ChangeLog
>>
>>         * sysdeps/x86_64/fpu/svml_d_cos2_core.S: New file.
>>         * sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S: New file.
>>         * sysdeps/x86_64/fpu/svml_d_cos4_core_avx2.S: New file.
>>         * sysdeps/x86_64/fpu/svml_d_cos_data.S: New file.
>>         * sysdeps/x86_64/fpu/svml_d_cos_data.h: New file.
>>         * sysdeps/x86_64/fpu/Versions: New file.
>>         * math/bits/mathcalls.h: Added cos declaration with __MATHCALL_VEC.
>>         * sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration for cos.
>>         * sysdeps/x86_64/configure: Regenerated.
>>         * sysdeps/x86_64/configure.ac: Options for libmvec build.
>>         * sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New file.
>>         * sysdeps/x86_64/fpu/Makefile: New file.
>
> I'll leave final review for when the ABI document explaining what the
> pragma means for function version availability on x86_64 is ready and
> agreed by compiler implementations, but apart from that this patch may be
> ready.
>
> --
> Joseph S. Myers
> joseph@codesourcery.com

Hi, Joseph,

To support ISAs older than SSE4 we need a runtime ISA check that switches
to an SSE2 version.
This is because the mangled name of the vector function is the same for all
SSE* ISAs.

I mean the implementation is changed like so:

visible)?
---
WBR,
Andrew

diff --git a/sysdeps/x86_64/fpu/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/svml_d_cos2_core.S
index 11d9c94..ccd0969 100644
--- a/sysdeps/x86_64/fpu/svml_d_cos2_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_cos2_core.S
@@ -1,4 +1,4 @@
-/* Function cos vectorized with SSE4.
+/* Function cos vectorized with SSE2 and SSE4.
    Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.

@@ -17,22 +17,51 @@
    <http://www.gnu.org/licenses/>.  */

 #include <sysdep.h>
+#include <init-arch.h>
 #include "svml_d_cos_data.h"

        .text
 ENTRY(_ZGVbN2v_cos)
+        .type   _ZGVbN2v_cos, @gnu_indirect_function  /* IFUNC symbol: the code below returns the chosen implementation's address in %rax.  */
+        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)  /* Nonzero kind field skips the init call; presumably means __cpu_features is already set up -- confirm in init-arch.h.  */
+        jne     1f
+        call    __init_cpu_features
+1:      leaq    _ZGVbN2v_cos_sse4(%rip), %rax  /* Tentatively select the SSE4 variant.  */
+        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+        jz      2f  /* SSE4.1 bit clear: take the SSE2 fallback instead.  */
+        ret
+2:      leaq    _ZGVbN2v_cos_sse2(%rip), %rax  /* Fallback: SSE2 variant.  */
+        ret
+END(_ZGVbN2v_cos)
+
+ENTRY(_ZGVbN2v_cos_sse2)
+/* SSE2 version as wrapper to scalar: calls cos once per 64-bit lane of %xmm0 and repacks both results into %xmm0.  */
+        subq      $40, %rsp  /* 40-byte scratch area.  NOTE(review): no CFI adjustment accompanies this -- confirm whether unwind info is needed here.  */
+        movaps    %xmm0, (%rsp)  /* Spill both input lanes: lane 0 at 0(%rsp), lane 1 at 8(%rsp).  Aligned store; assumes %rsp is 16-byte aligned here -- TODO confirm.  */
+        call      cos@PLT  /* First call: low lane of %xmm0 is still the lane-0 input.  */
+        movsd     %xmm0, 16(%rsp)  /* Keep the first result across the second call.  */
+        movsd     8(%rsp), %xmm0  /* Load lane 1 as the next argument.  */
+        call      cos@PLT
+        movsd     16(%rsp), %xmm1  /* Reload the first result.  */
+        movsd     %xmm0, 24(%rsp)  /* NOTE(review): 24(%rsp) is never read back in this function; this store looks redundant.  */
+        unpcklpd  %xmm0, %xmm1  /* %xmm1 = { first result (low), second result (high) }.  */
+        movaps    %xmm1, %xmm0  /* Move the packed pair into the return register.  */
+        addq      $40, %rsp
+        ret
+END(_ZGVbN2v_cos_sse2)

+ENTRY(_ZGVbN2v_cos_sse4)
 /* ALGORITHM DESCRIPTION:
  *
  *     ( low accuracy ( < 4ulp ) or enhanced performance ( half of correct
@@ -206,4 +235,4 @@ ENTRY(_ZGVbN2v_cos)
         movsd     %xmm0, 256(%rsp,%r15)
         jmp       .LBL_1_7

-END(_ZGVbN2v_cos)
+END(_ZGVbN2v_cos_sse4)

My question is: do we need to test this new SSE2 version (which is just a
simple wrapper around the scalar one)?

If yes, is it OK to have the corresponding test linked against libmvec.a (in
libmvec.so this SSE2 implementation is hidden and there is no need to make it