Discussion:
[PATCH RFC] allow inline intrinsics for __ieee754_sqrt/f
Jon Beniston
2018-06-22 10:49:03 UTC
Permalink
Hi,

Most functions in libm call __ieee754_sqrt when needing to perform a square
root. For most targets, this results in the s/w implementation in
math/e_sqrt.c being using, even if the target has a h/w sqrt instruction.
There are some targets that have machine specific implementations in
machine/*/, but even if a single instruction, that code typically doesn't
get inlined.

The following patch is one possible way to allow a sqrt instruction to be
used and for the calls to be inlined. I've just done this for x86/arm for
now. I've put this in include/machine/ieeefp.h, rather than fdlibm.h, as
that's where most of the other target specific code seems to be.

Not sure if using the __IEEE754_INLINE_SQRT* macros is the best way to
prevent redefinition errors. Perhaps someone has a better idea?

Cheers,
Jon

diff --git a/newlib/libc/include/machine/ieeefp.h
b/newlib/libc/include/machine/ieeefp.h
index 2fb2268ce..e917d74b0 100644
--- a/newlib/libc/include/machine/ieeefp.h
+++ b/newlib/libc/include/machine/ieeefp.h
@@ -87,6 +87,39 @@
# define __IEEE_BYTES_LITTLE_ENDIAN
# endif
#endif
+
+#if (__ARM_FP & 0x8) && !defined(__SOFTFP__)
+#define __IEEE754_INLINE_SQRT
+static inline double
+__ieee754_sqrt(double x)
+{
+ double result;
+#if __ARM_ARCH >= 6
+ __asm__ ("vsqrt.f64 %P0, %P1" : "=w" (result) : "w" (x));
+#else
+ /* VFP9 Erratum 760019, see GCC sources "gcc/config/arm/vfp.md" */
+ __asm__ ("vsqrt.f64 %P0, %P1" : "=&w" (result) : "w" (x));
+#endif
+ return result;
+}
+#endif
+
+#if (__ARM_FP & 0x4) && !defined(__SOFTFP__)
+#define __IEEE754_INLINE_SQRTF
+static inline float
+__ieee754_sqrtf(float x)
+{
+ float result;
+#if __ARM_ARCH >= 6
+ __asm__ ("vsqrt.f32 %0, %1" : "=w" (result) : "w" (x));
+#else
+ /* VFP9 Erratum 760019, see GCC sources "gcc/config/arm/vfp.md" */
+ __asm__ ("vsqrt.f32 %0, %1" : "=&w" (result) : "w" (x));
+#endif
+ return result;
+}
+#endif
+
#endif

#if defined (__aarch64__)
@@ -189,6 +222,25 @@

#ifdef __i386__
#define __IEEE_LITTLE_ENDIAN
+
+#define __IEEE754_INLINE_SQRT
+static inline double
+__ieee754_sqrt (double x)
+{
+ double result;
+ __asm__ ("fsqrt" : "=t" (result) : "0" (x));
+ return result;
+}
+
+#define __IEEE754_INLINE_SQRTF
+static inline float
+__ieee754_sqrtf (float x)
+{
+ float result;
+ __asm__ ("fsqrt" : "=t" (result) : "0" (x));
+ return result;
+}
+
#endif

#ifdef __riscv
diff --git a/newlib/libm/common/fdlibm.h b/newlib/libm/common/fdlibm.h
index 4523e8b2a..7eccce2b6 100644
--- a/newlib/libm/common/fdlibm.h
+++ b/newlib/libm/common/fdlibm.h
@@ -149,7 +149,9 @@ extern double significand __P((double));
extern long double __ieee754_hypotl __P((long double, long double));

/* ieee style elementary functions */
+#ifndef __IEEE754_INLINE_SQRT
extern double __ieee754_sqrt __P((double));
+#endif
extern double __ieee754_acos __P((double));
extern double __ieee754_acosh __P((double));
extern double __ieee754_log __P((double));
@@ -195,7 +197,9 @@ extern float scalbf __P((float, float));
extern float significandf __P((float));

/* ieee style elementary float functions */
+#ifndef __IEEE754_INLINE_SQRTF
extern float __ieee754_sqrtf __P((float));
+#endif
extern float __ieee754_acosf __P((float));
extern float __ieee754_acoshf __P((float));
extern float __ieee754_logf __P((float));
diff --git a/newlib/libm/machine/arm/e_sqrt.c
b/newlib/libm/machine/arm/e_sqrt.c
index 6f3eb8301..8d50ae234 100644
--- a/newlib/libm/machine/arm/e_sqrt.c
+++ b/newlib/libm/machine/arm/e_sqrt.c
@@ -24,7 +24,7 @@
* SUCH DAMAGE.
*/

-#if (__ARM_FP & 0x8) && !defined(__SOFTFP__)
+#if (__ARM_FP & 0x8) && !defined(__SOFTFP__) &&
!defined(__IEEE754_INLINE_SQRT)
#include <math.h>

double
diff --git a/newlib/libm/machine/arm/ef_sqrt.c
b/newlib/libm/machine/arm/ef_sqrt.c
index 3a1ba6cb4..3d8fd1191 100644
--- a/newlib/libm/machine/arm/ef_sqrt.c
+++ b/newlib/libm/machine/arm/ef_sqrt.c
@@ -24,7 +24,7 @@
* SUCH DAMAGE.
*/

-#if (__ARM_FP & 0x4) && !defined(__SOFTFP__)
+#if (__ARM_FP & 0x4) && !defined(__SOFTFP__) &&
!defined(__IEEE754_INLINE_SQRTF)
#include <math.h>

float
diff --git a/newlib/libm/math/e_sqrt.c b/newlib/libm/math/e_sqrt.c
index 78fc52417..313ae972c 100644
--- a/newlib/libm/math/e_sqrt.c
+++ b/newlib/libm/math/e_sqrt.c
@@ -83,6 +83,8 @@

#include "fdlibm.h"

+#ifndef __IEEE754_INLINE_SQRTF
+
#ifndef _DOUBLE_IS_32BITS

#ifdef __STDC__
@@ -194,6 +196,8 @@ static double one = 1.0, tiny=1.0e-300;

#endif /* defined(_DOUBLE_IS_32BITS) */

+#endif /* __IEEE754_INLINE_SQRTF */
+
/*
Other methods (use floating-point arithmetic)
-------------
diff --git a/newlib/libm/math/ef_sqrt.c b/newlib/libm/math/ef_sqrt.c
index 80e7f360e..9940bad32 100644
--- a/newlib/libm/math/ef_sqrt.c
+++ b/newlib/libm/math/ef_sqrt.c
@@ -15,6 +15,8 @@

#include "fdlibm.h"

+#ifndef __IEEE754_INLINE_SQRT
+
#ifdef __STDC__
static const float one = 1.0, tiny=1.0e-30;
#else
@@ -87,3 +89,5 @@ static float one = 1.0, tiny=1.0e-30;
SET_FLOAT_WORD(z,ix);
return z;
}
+
+#endif /* __IEEE754_INLINE_SQRT */
Corinna Vinschen
2018-06-25 11:41:37 UTC
Permalink
Post by Jon Beniston
Hi,
Most functions in libm call __ieee754_sqrt when needing to perform a square
root. For most targets, this results in the s/w implementation in
math/e_sqrt.c being using, even if the target has a h/w sqrt instruction.
There are some targets that have machine specific implementations in
machine/*/, but even if a single instruction, that code typically doesn't
get inlined.
The following patch is one possible way to allow a sqrt instruction to be
used and for the calls to be inlined. I've just done this for x86/arm for
now. I've put this in include/machine/ieeefp.h, rather than fdlibm.h, as
that's where most of the other target specific code seems to be.
Not sure if using the __IEEE754_INLINE_SQRT* macros is the best way to
prevent redefinition errors. Perhaps someone has a better idea?
Cheers,
Jon
diff --git a/newlib/libc/include/machine/ieeefp.h
[...]
#ifdef __i386__
#define __IEEE_LITTLE_ENDIAN
+
+#define __IEEE754_INLINE_SQRT
+static inline double
+__ieee754_sqrt (double x)
+{
+ double result;
+ __asm__ ("fsqrt" : "=t" (result) : "0" (x));
+ return result;
+}
+
+#define __IEEE754_INLINE_SQRTF
+static inline float
+__ieee754_sqrtf (float x)
+{
+ float result;
+ __asm__ ("fsqrt" : "=t" (result) : "0" (x));
+ return result;
+}
+
#endif
Can we really imply that an i386 always comes with a FPU?


Corinna
--
Corinna Vinschen
Cygwin Maintainer
Red Hat
Jon Beniston
2018-06-25 13:04:08 UTC
Permalink
Hi Corinna,
Post by Corinna Vinschen
Can we really imply that an i386 always comes with a FPU?
Looking through gcc's i386-c.c, we could use #ifndef _SOFT_FLOAT around that.

Cheers,
Jon
Hans-Bernhard Bröker
2018-06-26 17:01:55 UTC
Permalink
Post by Corinna Vinschen
Can we really imply that an i386 always comes with a FPU?
Given that that would be patently untrue, I'm pretty sure we can't.

Strictly speaking one couldn't even imply it for i486, "thanks" to the
486SX (and its incredibly silly optional companion, the 487SX).
j***@beniston.com
2018-08-17 12:05:01 UTC
Permalink
Post by Corinna Vinschen
Can we really imply that an i386 always comes with a FPU?
Here's an updated patch that doesn't inline sqrt on i386 if _SOFT_FLOAT is defined (This is what GCC seems to define to indicate if the h/w FPU is not available).

Cheers,
Jon
Wilco Dijkstra
2018-08-17 13:34:37 UTC
Permalink
Hi,

The best option is to let the compiler inline sqrt - it knows when it is feasible
and avoids having to add lots of target specific inline assembly code which is
hard to maintain.

In GLIBC I renamed all __ieee754_sqrt(f) uses to sqrt(f) and added -fno-math-errno
which allows compilers to inline sqrt on all targets. libm/common already uses
-fno-math-errno, but could be used in all of libm safely.

Wilco
j***@beniston.com
2018-08-17 13:47:54 UTC
Permalink
Hi Wilco,
Post by Wilco Dijkstra
The best option is to let the compiler inline sqrt - it knows when it is
feasible and avoids
Post by Wilco Dijkstra
having to add lots of target specific inline assembly code which is hard to
maintain.
Post by Wilco Dijkstra
In GLIBC I renamed all __ieee754_sqrt(f) uses to sqrt(f) and added -fno-math-errno
which allows compilers to inline sqrt on all targets. libm/common already uses
-fno-math-errno, but could be used in all of libm safely.
That works for most of the uses of __ieee754_sqrt, except, I think, for
those in w_sqrt.c / wf_sqrt.c, that implement sqrt & sqrtf. If the target
didn't have a builtin for sqrt, you'd end up with a recursive call, no?

Cheers,
Jon
Wilco Dijkstra
2018-08-17 14:11:24 UTC
Permalink
Post by j***@beniston.com
Post by Wilco Dijkstra
The best option is to let the compiler inline sqrt - it knows when it is
feasible and avoids
Post by j***@beniston.com
Post by Wilco Dijkstra
having to add lots of target specific inline assembly code which is hard to
maintain.
Post by j***@beniston.com
Post by Wilco Dijkstra
In GLIBC I renamed all __ieee754_sqrt(f) uses to sqrt(f) and added
-fno-math-errno
Post by j***@beniston.com
Post by Wilco Dijkstra
which allows compilers to inline sqrt on all targets. libm/common already
uses
Post by j***@beniston.com
Post by Wilco Dijkstra
-fno-math-errno, but could be used in all of libm safely.
That works for most of the uses of __ieee754_sqrt, except, I think, for
those in w_sqrt.c / wf_sqrt.c, that implement sqrt & sqrtf.  If the target
didn't have a builtin for sqrt, you'd end up with a recursive call, no?
You could add more libm/machine/*/w_sqrt.c files. These already exist
for arm and aarch64 - though they use inline assembler rather than the
obvious __builtin_sqrt.

Note that the sqrt function will never be called if you have a sqrt
instruction (even if you forget to use -fno-math-errno), so optimizing
w_sqrt.c is less important than ensuring __ieee754_sqrt gets inlined.

Wilco
j***@beniston.com
2018-08-17 14:18:38 UTC
Permalink
Hi Wilco,
Post by Wilco Dijkstra
You could add more libm/machine/*/w_sqrt.c files.
These already exist for arm and aarch64 - though they use
inline assembler rather than the obvious __builtin_sqrt.
They don't get inlined like that though. So the idea is to move them from
there, in to the header, to allow them to be inlined, saving the call
overhead in sqrt().
Post by Wilco Dijkstra
Note that the sqrt function will never be called if you have a sqrt
instruction (even if you forget to use -fno-math-errno),

It will be from application code.

Cheers,
Jon
Wilco Dijkstra
2018-08-17 14:25:51 UTC
Permalink
Post by Wilco Dijkstra
You could add more libm/machine/*/w_sqrt.c files.
These already exist for arm and aarch64 - though they use
inline assembler rather than the obvious __builtin_sqrt.
They don't get inlined like that though. So the idea is to move them from
there, in to the header, to allow them to be inlined, saving the call
overhead in sqrt().
Sqrt always gets inlined if you have a sqrt instruction.
Post by Wilco Dijkstra
Post by Wilco Dijkstra
Note that the sqrt function will never be called if you have a sqrt
instruction (even if you forget to use -fno-math-errno),
Post by Wilco Dijkstra
It will be from application code.
No, try it. Sqrt instructions always get inlined.

Wilco
j***@beniston.com
2018-08-17 14:41:32 UTC
Permalink
Hi Wilco,
Post by Wilco Dijkstra
Post by j***@beniston.com
Post by Wilco Dijkstra
You could add more libm/machine/*/w_sqrt.c files.
These already exist for arm and aarch64 - though they use inline
assembler rather than the obvious __builtin_sqrt.
They don't get inlined like that though. So the idea is to move them
from there, in to the header, to allow them to be inlined, saving the
call overhead in sqrt().
Sqrt always gets inlined if you have a sqrt instruction.
The code calls __builtin_sqrt, not sqrt. It's that I want to inline. If you
change to call sqrt, you'll potentially get recursion.
Post by Wilco Dijkstra
Post by j***@beniston.com
Post by Wilco Dijkstra
Note that the sqrt function will never be called if you have a sqrt
instruction (even if you forget to use -fno-math-errno),
It will be from application code.
No, try it. Sqrt instructions always get inlined.
On the <0 path, GCC still calls sqrt in order to set errno.

Regards,
Jon
Wilco Dijkstra
2018-08-17 15:01:37 UTC
Permalink
Post by j***@beniston.com
The code calls __builtin_sqrt, not sqrt. It's that I want to inline. If you
change to call sqrt, you'll potentially get recursion.
You mean __ieee754_sqrt? You can replace all calls to it by sqrt,
except for w_sqrt.c and wf_sqrt.c where you leave it as is.
So there is no recursion.
Post by j***@beniston.com
On the <0 path, GCC still calls sqrt in order to set errno.
Sure, but correct code executes the sqrt instruction, not the call.

Wilco
j***@beniston.com
2018-08-17 15:05:24 UTC
Permalink
Hi Wilco,
Post by Wilco Dijkstra
You mean __ieee754_sqrt?
Yes.
Post by Wilco Dijkstra
You can replace all calls to it by sqrt, except for w_sqrt.c and wf_sqrt.c
where you leave it as is.
Post by Wilco Dijkstra
So there is no recursion.
But then it will not be inlined. I'd like it to be.

Cheers,
Jon
Wilco Dijkstra
2018-08-17 15:32:10 UTC
Permalink
Post by j***@beniston.com
Post by Wilco Dijkstra
You can replace all calls to it by sqrt, except for w_sqrt.c and wf_sqrt.c
where you leave it as is.
Post by j***@beniston.com
Post by Wilco Dijkstra
So there is no recursion.
But then it will not be inlined. I'd like it to be.
You could add machine/*/w_sqrt.c too that use sqrt. Or in the generic one
call the builtin rather than __ieee754_sqrt based on a define. But none of
this will improve performance or codesize.

Wilco
j***@beniston.com
2018-08-17 21:43:04 UTC
Permalink
Hi Wilco,
You can replace all calls to it by sqrt, except for w_sqrt.c and wf_sqrt.c
While -fno-builtin is still being used, this would just make the code slower though.
Or in the generic one call the builtin rather than __ieee754_sqrt based on a define.
I looked at doing this, as the builtin seems nicer than inline asm, but the builtin for GCC (not clang) also calls sqrt for negative numbers unless -fno-math-errno is used.

Unfortunately, if this option is used as a function attribute, then it prevents the function from being inlined and the automake documentation suggests you can't do per file options.
But none of this will improve performance or codesize.
Sure it does in my use case, otherwise I wouldn't be bothering 😉

Cheers,
Jon
Wilco Dijkstra
2018-08-24 15:05:44 UTC
Permalink
Post by j***@beniston.com
You can replace all calls to it by sqrt, except for w_sqrt.c and  wf_sqrt.c
While -fno-builtin is still being used, this would just make the code slower though.
No it would be faster even with -fno-builtin given that practically all uses of sqrt will
execute the instruction, not the call.
Post by j***@beniston.com
Or in the generic one call the builtin rather than __ieee754_sqrt based on a define.
I looked at doing this, as the builtin seems nicer than inline asm, but the builtin for
GCC (not clang) also calls sqrt for negative numbers unless -fno-math-errno is used.
I just tried:

if (x >= 0) return __builtin_sqrt(x);

That's faster than the existing code and also works fine with both -fmath-errno
and -fno-builtin.
Post by j***@beniston.com
But none of this will improve performance or codesize.
Sure it does in my use case, otherwise I wouldn't be bothering 😉
That's not possible since sqrt function itself is never called unless the input is
nega
j***@beniston.com
2018-08-24 20:54:00 UTC
Permalink
Hi Wilco,
Post by j***@beniston.com
Post by Wilco Dijkstra
You can replace all calls to it by sqrt, except for w_sqrt.c and
wf_sqrt.c
While -fno-builtin is still being used, this would just make the code slower though.
No it would be faster even with -fno-builtin given that practically all uses of sqrt will execute the instruction, not the call.
Not all targets have a sqrt instruction though.

Cheers,
Jon
Wilco Dijkstra
2018-09-03 16:12:08 UTC
Permalink
Hi Jon,
Post by j***@beniston.com
Post by j***@beniston.com
You can replace all calls to it by sqrt, except for w_sqrt.c and 
wf_sqrt.c
While -fno-builtin is still being used, this would just make the code slower though.
No it would be faster even with -fno-builtin given that practically all uses of sqrt will execute the instruction, not the call.
Not all targets have a sqrt instruction though.
Like I said, it's not really important to optimize this case, but we could have
machine/*/w_sqrt.c or a new define like TARGET_HAS_SQRT. There is already
math_config.h where such a define could be added.

Wilco
j***@beniston.com
2018-09-03 16:52:30 UTC
Permalink
Hi Wilco,
Post by Wilco Dijkstra
Post by j***@beniston.com
Not all targets have a sqrt instruction though.
Like I said, it's not really important to optimize this case, but we could
have machine
Post by Wilco Dijkstra
/*/w_sqrt.c or a new define like TARGET_HAS_SQRT. There is already
math_config.h
Post by Wilco Dijkstra
where such a define could be added.
I'm confused about what you are talking about now.

I was referring to your suggestion that all calls throughout libm to
__ieee754_sqrt() should be replaced with calls to sqrt() as you think gcc
can optimise it. While this may work for targets with a h/w sqrt
instruction, this will be slower for targets that do not have a sqrt
instruction, as you'll end up with the extra overhead and error handling
code in w_sqrt.c being executed, no?

Cheers,
Jon
Wilco Dijkstra
2018-09-03 17:35:29 UTC
Permalink
Hi Jon,
Post by j***@beniston.com
I was referring to your suggestion that all calls throughout libm to
__ieee754_sqrt() should be replaced with calls to sqrt() as you think gcc
can optimise it. While this may work for targets with a h/w sqrt
instruction, this will be slower for targets that do not have a sqrt
instruction, as you'll end up with the extra overhead and error handling
code in w_sqrt.c being executed, no?
Right - I thought your main concern was inlining sqrt instructions since they
are currently not inlined. It wouldn't be slower for targets without sqrt since
removing all calls to __ieee754_sqrt means you can now merge w_sqrt.c
and e_sqrt.c.

Both implementations have a check for < 0, and you only need one now.
All the complex errno handling could be simplified too, check how trivial it
is in the new math code, eg. common/sinf.c.

Wilco
j***@beniston.com
2018-09-04 09:41:32 UTC
Permalink
This post might be inappropriate. Click to display it.
Loading...