summaryrefslogtreecommitdiff
path: root/thirdparty/embree-aarch64/kernels/common/stack_item.h
blob: 533c3853652ed688856b31e367b4302895ee162f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"

namespace embree
{
  /*! An item on the stack holds the node ID and distance of that node. */
  template<typename T>
  struct __aligned(16) StackItemT
  {
    /*! assert that the xchg function works */
    static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed");

    __forceinline StackItemT() {}

    __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {}

    /*! use SSE instructions to swap stack items */
    __forceinline static void xchg(StackItemT& a, StackItemT& b) 
    { 
      const vfloat4 sse_a = vfloat4::load((float*)&a); 
      const vfloat4 sse_b = vfloat4::load((float*)&b);
      vfloat4::store(&a,sse_b);
      vfloat4::store(&b,sse_a);
    }

    /*! Sort 2 stack items. */
    __forceinline friend void sort(StackItemT& s1, StackItemT& s2) {
      if (s2.dist < s1.dist) xchg(s2,s1);
    }
    
    /*! Sort 3 stack items. */
    __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3)
    {
      if (s2.dist < s1.dist) xchg(s2,s1);
      if (s3.dist < s2.dist) xchg(s3,s2);
      if (s2.dist < s1.dist) xchg(s2,s1);
    }
    
    /*! Sort 4 stack items. */
    __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4)
    {
      if (s2.dist < s1.dist) xchg(s2,s1);
      if (s4.dist < s3.dist) xchg(s4,s3);
      if (s3.dist < s1.dist) xchg(s3,s1);
      if (s4.dist < s2.dist) xchg(s4,s2);
      if (s3.dist < s2.dist) xchg(s3,s2);
    }

    /*! use SSE instructions to swap stack items */
    __forceinline static void cmp_xchg(vint4& a, vint4& b) 
    { 
#if defined(__AVX512VL__)
      const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a));
#else
      const vboolf4 mask0(b < a);
      const vboolf4 mask(shuffle<2,2,2,2>(mask0));
#endif
      const vint4 c = select(mask,b,a);
      const vint4 d = select(mask,a,b);
      a = c;
      b = d;
    }
    
    /*! Sort 3 stack items. */
    __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3)
    {
      cmp_xchg(s2,s1);
      cmp_xchg(s3,s2);
      cmp_xchg(s2,s1);
    }
    
    /*! Sort 4 stack items. */
    __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4)
    {
      cmp_xchg(s2,s1);
      cmp_xchg(s4,s3);
      cmp_xchg(s3,s1);
      cmp_xchg(s4,s2);
      cmp_xchg(s3,s2);
    }


    /*! Sort N stack items. */
    __forceinline friend void sort(StackItemT* begin, StackItemT* end)
    {
      for (StackItemT* i = begin+1; i != end; ++i)
      {
        const vfloat4 item = vfloat4::load((float*)i);
        const unsigned dist = i->dist;
        StackItemT* j = i;

        while ((j != begin) && ((j-1)->dist < dist))
        {
          vfloat4::store(j, vfloat4::load((float*)(j-1)));
          --j;
        }

        vfloat4::store(j, item);
      }
    }
    
  public:
    T ptr;
    unsigned dist;
  };

  /*! An item on the stack holds the node ID and active ray mask. */
  template<typename T>
  struct __aligned(8) StackItemMaskT
  {
    T ptr;
    size_t mask;
  };

  struct __aligned(8) StackItemMaskCoherent
  {
    size_t mask;
    size_t parent;
    size_t child;
  };
}