1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
|
// ======================================================================== //
// Copyright 2009-2019 Intel Corporation //
// //
// Licensed under the Apache License, Version 2.0 (the "License"); //
// you may not use this file except in compliance with the License. //
// You may obtain a copy of the License at //
// //
// http://www.apache.org/licenses/LICENSE-2.0 //
// //
// Unless required by applicable law or agreed to in writing, software //
// distributed under the License is distributed on an "AS IS" BASIS, //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and //
// limitations under the License. //
// ======================================================================== //
#if defined(_MSC_VER)
#pragma warning (disable : 4146) // unary minus operator applied to unsigned type, result still unsigned
#endif
#if defined(__APPLE__)
#include <mach/thread_act.h>
#include <mach/mach_init.h>
#endif
#include "thread.h"
#include <fstream>
namespace oidn {
#if defined(_WIN32)
// --------------------------------------------------------------------------
// ThreadAffinity - Windows
// --------------------------------------------------------------------------
// Detects the processor topology via GetLogicalProcessorInformationEx and
// builds one GROUP_AFFINITY entry per hardware thread, keeping at most
// numThreadsPerCore threads of each physical core. On any failure the
// affinity lists stay empty, which makes set()/restore() no-ops.
ThreadAffinity::ThreadAffinity(int numThreadsPerCore, int verbose)
  : Verbose(verbose)
{
  // The processor-group APIs may be missing on older Windows versions, so
  // look them up dynamically instead of linking against them directly
  HMODULE hLib = GetModuleHandle(TEXT("kernel32"));
  pGetLogicalProcessorInformationEx = (GetLogicalProcessorInformationExFunc)GetProcAddress(hLib, "GetLogicalProcessorInformationEx");
  pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hLib, "SetThreadGroupAffinity");

  if (pGetLogicalProcessorInformationEx && pSetThreadGroupAffinity)
  {
    // Get logical processor information
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer = nullptr;
    DWORD bufferSize = 0;

    // First call the function with an empty buffer to get the required buffer size
    BOOL result = pGetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufferSize);
    if (result || GetLastError() != ERROR_INSUFFICIENT_BUFFER)
    {
      OIDN_WARNING("GetLogicalProcessorInformationEx failed");
      return;
    }

    // Allocate the buffer
    buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufferSize);
    if (!buffer)
    {
      OIDN_WARNING("SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX allocation failed");
      return;
    }

    // Call again the function but now with the properly sized buffer
    result = pGetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufferSize);
    if (!result)
    {
      OIDN_WARNING("GetLogicalProcessorInformationEx failed");
      free(buffer);
      return;
    }

    // Iterate over the logical processor information structures
    // There should be one structure for each physical core
    char* ptr = (char*)buffer;
    while (ptr < (char*)buffer + bufferSize)
    {
      PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX item = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)ptr;
      if (item->Relationship == RelationProcessorCore && item->Processor.GroupCount > 0)
      {
        // Iterate over the groups
        int numThreads = 0;
        for (int group = 0; (group < item->Processor.GroupCount) && (numThreads < numThreadsPerCore); ++group)
        {
          GROUP_AFFINITY coreAffinity = item->Processor.GroupMask[group];
          while ((coreAffinity.Mask != 0) && (numThreads < numThreadsPerCore))
          {
            // Extract the next set bit/thread from the mask
            // (mask & -mask isolates the lowest set bit; negating the
            // unsigned mask is why warning C4146 is disabled above)
            GROUP_AFFINITY threadAffinity = coreAffinity;
            threadAffinity.Mask = threadAffinity.Mask & -threadAffinity.Mask;

            // Push the affinity for this thread
            affinities.push_back(threadAffinity);
            oldAffinities.push_back(threadAffinity);
            numThreads++;

            // Remove this bit/thread from the mask
            coreAffinity.Mask ^= threadAffinity.Mask;
          }
        }
      }

      // Next structure
      ptr += item->Size;
    }

    // Free the buffer
    free(buffer);
  }
}
void ThreadAffinity::set(int threadIndex)
{
if (threadIndex >= (int)affinities.size())
return;
// Save the current affinity and set the new one
const HANDLE thread = GetCurrentThread();
if (!pSetThreadGroupAffinity(thread, &affinities[threadIndex], &oldAffinities[threadIndex]))
OIDN_WARNING("SetThreadGroupAffinity failed");
}
void ThreadAffinity::restore(int threadIndex)
{
if (threadIndex >= (int)affinities.size())
return;
// Restore the original affinity
const HANDLE thread = GetCurrentThread();
if (!pSetThreadGroupAffinity(thread, &oldAffinities[threadIndex], nullptr))
OIDN_WARNING("SetThreadGroupAffinity failed");
}
#elif defined(__linux__)
// --------------------------------------------------------------------------
// ThreadAffinity - Linux
// --------------------------------------------------------------------------
// Parses the Linux sysfs CPU topology and builds one cpu_set_t per hardware
// thread, keeping at most numThreadsPerCore threads of each physical core.
// If no topology can be read, the affinity lists stay empty, which makes
// set()/restore() no-ops.
ThreadAffinity::ThreadAffinity(int numThreadsPerCore, int verbose)
  : Verbose(verbose)
{
  std::vector<int> threadIds;

  // Parse the thread/CPU topology
  for (int cpuId = 0; ; cpuId++)
  {
    std::fstream fs;
    std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string(cpuId) + std::string("/topology/thread_siblings_list");
    fs.open(cpu.c_str(), std::fstream::in);
    if (fs.fail()) break; // no such CPU -> done enumerating

    // thread_siblings_list holds comma-separated IDs of the hardware threads
    // sharing this CPU's physical core; take the first numThreadsPerCore
    // not-yet-seen ones so siblings end up adjacent in threadIds
    int i;
    int j = 0;
    while ((j < numThreadsPerCore) && (fs >> i))
    {
      if (std::none_of(threadIds.begin(), threadIds.end(), [&](int id) { return id == i; }))
        threadIds.push_back(i);
      if (fs.peek() == ',')
        fs.ignore();
      j++;
    }
    fs.close();
  }

#if 0
  // Debug dump of the computed thread mapping
  // (fixed: this referenced a nonexistent 'thread_ids' variable and would
  // not have compiled if enabled)
  for (size_t i = 0; i < threadIds.size(); ++i)
    std::cout << "thread " << i << " -> " << threadIds[i] << std::endl;
#endif

  // Create the affinity structures (one single-CPU set per thread); the old
  // affinities are overwritten by set() before they are ever used
  affinities.resize(threadIds.size());
  oldAffinities.resize(threadIds.size());

  for (size_t i = 0; i < threadIds.size(); ++i)
  {
    cpu_set_t affinity;
    CPU_ZERO(&affinity);
    CPU_SET(threadIds[i], &affinity);

    affinities[i] = affinity;
    oldAffinities[i] = affinity;
  }
}
// Pins the calling thread to the CPU set assigned to threadIndex, saving the
// thread's previous affinity so restore() can undo the change.
void ThreadAffinity::set(int threadIndex)
{
  if (threadIndex >= (int)affinities.size())
    return; // no affinity was computed for this index

  const pthread_t self = pthread_self();

  // Remember the current affinity; if the query fails, fall back to the
  // target affinity so a later restore() effectively becomes a no-op
  const int getStatus = pthread_getaffinity_np(self, sizeof(cpu_set_t), &oldAffinities[threadIndex]);
  if (getStatus != 0)
  {
    OIDN_WARNING("pthread_getaffinity_np failed");
    oldAffinities[threadIndex] = affinities[threadIndex];
    return;
  }

  // Apply the new affinity
  const int setStatus = pthread_setaffinity_np(self, sizeof(cpu_set_t), &affinities[threadIndex]);
  if (setStatus != 0)
    OIDN_WARNING("pthread_setaffinity_np failed");
}
// Reverts the calling thread to the affinity captured by the matching set()
// call. Does nothing for indices without an affinity entry.
void ThreadAffinity::restore(int threadIndex)
{
  if (threadIndex >= (int)affinities.size())
    return; // nothing was set for this index

  // Re-apply the affinity saved by set()
  const int status = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &oldAffinities[threadIndex]);
  if (status != 0)
    OIDN_WARNING("pthread_setaffinity_np failed");
}
#elif defined(__APPLE__)
// --------------------------------------------------------------------------
// ThreadAffinity - macOS
// --------------------------------------------------------------------------
// Builds per-thread affinity policies for macOS. Since macOS cannot pin a
// thread to a specific core, threads that should share a physical core are
// given the same affinity_tag, which only hints the scheduler to keep them
// close together.
ThreadAffinity::ThreadAffinity(int numThreadsPerCore, int verbose)
  : Verbose(verbose)
{
  // Query the thread/CPU topology
  int numPhysicalCpus;
  int numLogicalCpus;

  const bool topologyOk = getSysctl("hw.physicalcpu", numPhysicalCpus) &&
                          getSysctl("hw.logicalcpu", numLogicalCpus);
  if (!topologyOk)
  {
    OIDN_WARNING("sysctlbyname failed");
    return;
  }

  // Sanity check: logical CPUs should be a multiple of physical CPUs
  if ((numLogicalCpus % numPhysicalCpus != 0) && (numThreadsPerCore > 1))
    return; // this shouldn't happen

  const int maxThreadsPerCore = numLogicalCpus / numPhysicalCpus;
  const int threadsPerCore = min(numThreadsPerCore, maxThreadsPerCore);

  // Create the affinity structures: one entry per thread, grouped by core tag
  for (int core = 1; core <= numPhysicalCpus; ++core) // tags start from 1!
  {
    thread_affinity_policy affinity;
    affinity.affinity_tag = core;

    for (int thread = 0; thread < threadsPerCore; ++thread)
    {
      affinities.push_back(affinity);
      oldAffinities.push_back(affinity);
    }
  }
}
void ThreadAffinity::set(int threadIndex)
{
if (threadIndex >= (int)affinities.size())
return;
const auto thread = mach_thread_self();
// Save the current affinity
mach_msg_type_number_t policyCount = THREAD_AFFINITY_POLICY_COUNT;
boolean_t getDefault = FALSE;
if (thread_policy_get(thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&oldAffinities[threadIndex], &policyCount, &getDefault) != KERN_SUCCESS)
{
OIDN_WARNING("thread_policy_get failed");
oldAffinities[threadIndex] = affinities[threadIndex];
return;
}
// Set the new affinity
if (thread_policy_set(thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&affinities[threadIndex], THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS)
OIDN_WARNING("thread_policy_set failed");
}
// Reverts the calling thread to the affinity policy captured by the matching
// set() call. Does nothing for indices without an affinity entry.
void ThreadAffinity::restore(int threadIndex)
{
  if (threadIndex >= (int)affinities.size())
    return; // nothing was set for this index

  // Re-apply the policy saved by set()
  const auto self = mach_thread_self();
  if (thread_policy_set(self, THREAD_AFFINITY_POLICY, (thread_policy_t)&oldAffinities[threadIndex], THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS)
    OIDN_WARNING("thread_policy_set failed");
}
#endif
} // namespace oidn
|