Pyrogenesis  13997
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
wprofiler.cpp
Go to the documentation of this file.
1 /* Copyright (c) 2010 Wildfire Games
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining
4  * a copy of this software and associated documentation files (the
5  * "Software"), to deal in the Software without restriction, including
6  * without limitation the rights to use, copy, modify, merge, publish,
7  * distribute, sublicense, and/or sell copies of the Software, and to
8  * permit persons to whom the Software is furnished to do so, subject to
9  * the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "precompiled.h"
24 
25 #if 0
26 
27 // we need a means of measuring performance, since it is hard to predict and
28 // depends on many factors. to cover a wider range of configurations, this
29 // must also be possible on end-user systems lacking specialized developer
30 // tools. therefore, we must ship our own implementation; this complements
31 // Intel VTune et al.
32 //
33 // there are 3 approaches to the problem:
34 // - single-step analysis logs every executed instruction. very thorough, but
35 // intolerably slow (~1000x) and not suitable for performance measurement.
36 // - intrusive measuring tracks execution time of explicitly marked
37 // functions or 'zones'. more complex, requires adding code, and
38 // inaccurate when thread switches are frequent.
39 // - IP sampling records the current instruction pointer at regular
40 // intervals; slow sections of code will over time appear more often.
41 // not exact, but simple and low-overhead.
42 //
43 // we implement IP sampling due to its simplicity. an intrusive approach
44 // might also be added later to account for performance per-module
45 // (helps spot the culprit in case hotspots are called from multiple sites).
46 
47 
48 // on Windows, we retrieve the current IP with GetThreadContext. dox require
49 // this to happen from another thread, and for the target to be suspended
50 // (now enforced by XP SP2). this leads to all sorts of problems:
51 // - if the suspended thread was dispatching an exception in the kernel,
52 // register state may be a mix between the correct values and
53 // those captured from the exception.
54 // - if running on Win9x with real-mode drivers, interrupts may interfere
55 // with GetThreadContext. however, it's not supported anyway due to other
56 // deficiencies (e.g. lack of proper mmap support).
57 // - the suspended thread may be holding locks; we need to be extremely
58 // careful to avoid deadlock! many win api functions acquire locks in
59 // non-obvious ways.
60 
61 static HANDLE prof_target_thread;
62 
63 static pthread_t prof_thread;
64 
65 // delay [ms] between samples. OS sleep timers usually provide only
66 // ms resolution. increasing interval reduces overhead and accuracy.
67 static const int PROFILE_INTERVAL_MS = 1;
68 
69 
70 static uintptr_t get_target_pc()
71 {
72  DWORD ret;
73  HANDLE hThread = prof_target_thread; // convenience
74 
75  ret = SuspendThread(hThread);
76  if(ret == (DWORD)-1)
77  {
78  DEBUG_WARN_ERR(ERR::LOGIC); // get_target_pc: SuspendThread failed
79  return 0;
80  }
81  // note: we don't need to call more than once: this increments a DWORD
82  // 'suspend count'; target is guaranteed to be suspended unless
83  // the function failed.
84 
85  /////////////////////////////////////////////
86 
87  // be VERY CAREFUL to avoid anything that may acquire a lock until
88  // after ResumeThread! this includes locks taken by the OS,
89  // e.g. malloc -> heap or GetProcAddress -> loader.
90  // reason is, if the target thread was holding a lock we try to
91  // acquire here, a classic deadlock results.
92 
93  uintptr_t pc = 0; // => will return 0 if GetThreadContext fails
94 
95  CONTEXT context;
96  context.ContextFlags = CONTEXT_CONTROL;
97  if(GetThreadContext(hThread, &context))
98  pc = context.PC_;
99 
100  /////////////////////////////////////////////
101 
102  ret = ResumeThread(hThread);
103  ENSURE(ret != 0);
104  // don't fail (we have a valid PC), but warn
105 
106  return pc;
107 }
108 
109 
110 static pthread_t thread;
111 static sem_t exit_flag;
112 
113 static void* prof_thread_func(void* UNUSED(data))
114 {
115  debug_SetThreadName("eip_sampler");
116 
117  const long _1e6 = 1000000;
118  const long _1e9 = 1000000000;
119 
120  for(;;)
121  {
122  // calculate absolute timeout for sem_timedwait
123  struct timespec abs_timeout;
124  clock_gettime(CLOCK_REALTIME, &abs_timeout);
125  abs_timeout.tv_nsec += PROFILE_INTERVAL_MS * _1e6;
126  // .. handle nanosecond wraparound (must not be > 1000m)
127  if(abs_timeout.tv_nsec >= _1e9)
128  {
129  abs_timeout.tv_nsec -= _1e9;
130  abs_timeout.tv_sec++;
131  }
132 
133  errno = 0;
134  // if we acquire the semaphore, exit was requested.
135  if(sem_timedwait(&exit_flag, &abs_timeout) == 0)
136  break;
137  // actual error: warn
138  if(errno != ETIMEDOUT)
139  DEBUG_WARN_ERR(ERR::LOGIC); // wpcu prof_thread_func: sem_timedwait failed
140 
141  uintptr_t pc = get_target_pc();
142  UNUSED2(pc);
143 
144  // ADD TO LIST
145  }
146 
147  return 0;
148 }
149 
150 
151 
152 // call from thread that is to be profiled
153 Status prof_start()
154 {
155  // we need a real HANDLE to the target thread for use with
156  // Suspend|ResumeThread and GetThreadContext.
157  // alternative: DuplicateHandle on the current thread pseudo-HANDLE.
158  // this way is a bit more obvious/simple.
159  const DWORD access = THREAD_GET_CONTEXT|THREAD_SUSPEND_RESUME;
160  HANDLE hThread = OpenThread(access, FALSE, GetCurrentThreadId());
161  if(hThread == INVALID_HANDLE_VALUE)
163 
164  prof_target_thread = hThread;
165 
166  sem_init(&exit_flag, 0, 0);
167  pthread_create(&thread, 0, prof_thread_func, 0);
168  return INFO::OK;
169 }
170 
171 Status prof_shutdown()
172 {
173  WARN_IF_FALSE(CloseHandle(prof_target_thread));
174  return INFO::OK;
175 }
176 
177 
178 
179 /*
180 open question: how to store the EIP values returned? some background:
181 the mechanism above churns out an EIP value (may be in our process, but might
182 also be bogus); we need to store it somehow pending analysis.
183 
184 when done with the current run, we'd want to resolve EIP -> function name,
185 source file etc. (rather slow, so don't do it at runtime).
186 
187 so, how to store it in the meantime? 2 possibilities:
188 - simple array/vector of addresses (of course optimized to reduce allocs)
189 - fixed size array of 'bins' (range of addresses; may be as fine as 1 byte);
190 each bin has a counter which is incremented when the bin's corresponding
191 address has been hit.
192 
193 it's a size tradeoff here; for simple runs of < 1 min (60,000 ms), #1
194 would use 240kb of mem. #2 requires sizeof_whole_program * bytes_per_counter
195 up front, and has problems measuring DLLs (we'd have to explicitly map
196 the DLL address range into a bin - ugh). however, if we ever want to
197 test for say an hour (improves accuracy of profiling due to larger sample size),
198 #1 would guzzle 15mb of memory.
199 
200 hm, another idea would be to write out #1's list of addresses periodically.
201 to make sure the disk I/O doesn't come at a bad time, we could have the main
202 thread call into the profiler and request it write out at that time.
203 this would require extreme caution to avoid the deadlock problem, but looks
204 doable.
205 
206 -------- [2] ----------
207 
208 realistic profiler runs will take up to an hour.
209 
210 writing out to disk would work: could have main thread call back.
211 that and adding EIP to list would be atomic (locked).
212 BUT: large amount of data, that's bad (loading at 30mb/s => 500ms load time alone)
213 
214 problem with enumerating all symbols at startup: how do we enum all DLLs?
215 
216 hybrid idea: std::map of EIPs. we don't build the map at startup,
217 but add when first seen and subsequently increment counter stored there.
218 problem: uses more memory/slower access than list.
219 would have to make sure EIPs are reused.
220 to help that, could quantize down to 4 byte (or so) bins.
221 accessing debug information at runtime to determine function length is too slow.
222 
223 maybe some weird data structure: one bucket controls say 256 bytes of code
224 bucket is found by stripping off lower 8 bits. then, store only
225 the hit count for that byte. where's the savings over normal count?
226 
227 TODO: what if the thread is sleeping at the time we query EIP?
228 can't detect that - suspend count is only set by SuspendThread
229 do we want to report that point (it's good to know), or try to access other threads?
230 
231 TODO split off target thread / get PC into sysdep; profiler thread is portable!
232 
233 
234 at exit: resolve list to hotspots
235 probably hard; a start would be just the function in which the address is, then hit count
236 
237 
238 ==========================================
239 
240 
241 */
242 #endif
const Status LOGIC
Definition: status.h:409
#define UNUSED(param)
mark a function parameter as unused and avoid the corresponding compiler warning. ...
const Status OK
Definition: status.h:386
#define ETIMEDOUT
Definition: werrno.h:106
int clock_gettime(clockid_t clock, struct timespec *ts)
Definition: wtime.cpp:121
static const i64 _1e9
Definition: wtime.cpp:45
int sem_timedwait(sem_t *sem, const struct timespec *abs_timeout)
Definition: wpthread.cpp:520
LIB_API void debug_SetThreadName(const char *name)
inform the debugger of the current thread&#39;s name.
Definition: bdbg.cpp:126
#define ENSURE(expr)
ensure the expression &lt;expr&gt; evaluates to non-zero.
Definition: debug.h:282
#define UNUSED2(param)
mark a function local variable or parameter as unused and avoid the corresponding compiler warning...
int pthread_create(pthread_t *thread_id, const void *attr, void *(*func)(void *), void *arg)
Definition: wpthread.cpp:636
void * HANDLE
Definition: wgl.h:62
unsigned long DWORD
Definition: wgl.h:56
int sem_init(sem_t *sem, int pshared, unsigned value)
Definition: wpthread.cpp:444
i64 Status
Error handling system.
Definition: status.h:171
uintptr_t sem_t
Definition: wpthread.h:105
#define DEBUG_WARN_ERR(status)
display the error dialog with text corresponding to the given error code.
Definition: debug.h:331
uintptr_t pthread_t
Definition: wpthread.h:63
#define WARN_IF_FALSE(expression)
Definition: status.h:360
#define WARN_RETURN(status)
Definition: status.h:255
const Status FAIL
Definition: status.h:406
Definition: wtime.h:62
static const long _1e6
Definition: wtime.cpp:43