builtin-sched.c 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808
  1. #include "builtin.h"
  2. #include "perf.h"
  3. #include "util/util.h"
  4. #include "util/evlist.h"
  5. #include "util/cache.h"
  6. #include "util/evsel.h"
  7. #include "util/symbol.h"
  8. #include "util/thread.h"
  9. #include "util/header.h"
  10. #include "util/session.h"
  11. #include "util/tool.h"
  12. #include "util/cloexec.h"
  13. #include "util/parse-options.h"
  14. #include "util/trace-event.h"
  15. #include "util/debug.h"
  16. #include <sys/prctl.h>
  17. #include <sys/resource.h>
  18. #include <semaphore.h>
  19. #include <pthread.h>
  20. #include <math.h>
  21. #include <api/fs/fs.h>
  /*
   * Fixed sizes and fallbacks used by the replay code.
   *
   * PR_SET_NAME is normally provided by <sys/prctl.h>; the fallback is only
   * defined when the header did not supply it, so the macro is never
   * redefined.
   */
  #ifndef PR_SET_NAME
  #define PR_SET_NAME 15 /* Set process name */
  #endif
  #define MAX_CPUS 4096   /* size of the per-CPU arrays in struct perf_sched */
  #define COMM_LEN 20     /* size of task_desc::comm */
  #define SYM_LEN 129
  #define MAX_PID 1024000 /* used when kernel/pid_max cannot be read */
  27. struct sched_atom;
  28. struct task_desc {
  29. unsigned long nr;
  30. unsigned long pid;
  31. char comm[COMM_LEN];
  32. unsigned long nr_events;
  33. unsigned long curr_event;
  34. struct sched_atom **atoms;
  35. pthread_t thread;
  36. sem_t sleep_sem;
  37. sem_t ready_for_work;
  38. sem_t work_done_sem;
  39. u64 cpu_usage;
  40. };
  /* Kind of action a sched_atom asks the replay thread to perform. */
  enum sched_event_type {
      SCHED_EVENT_RUN       = 0,
      SCHED_EVENT_SLEEP     = 1,
      SCHED_EVENT_WAKEUP    = 2,
      SCHED_EVENT_MIGRATION = 3,
  };
  47. struct sched_atom {
  48. enum sched_event_type type;
  49. int specific_wait;
  50. u64 timestamp;
  51. u64 duration;
  52. unsigned long nr;
  53. sem_t *wait_sem;
  54. struct task_desc *wakee;
  55. };
  /* State letters looked up by sched_out_state(). */
  #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"

  /* Life-cycle state of a work_atom in the latency analysis. */
  enum thread_state {
      THREAD_SLEEPING = 0,
      THREAD_WAIT_CPU = 1,
      THREAD_SCHED_IN = 2,
      THREAD_IGNORE   = 3
  };
  63. struct work_atom {
  64. struct list_head list;
  65. enum thread_state state;
  66. u64 sched_out_time;
  67. u64 wake_up_time;
  68. u64 sched_in_time;
  69. u64 runtime;
  70. };
  71. struct work_atoms {
  72. struct list_head work_list;
  73. struct thread *thread;
  74. struct rb_node node;
  75. u64 max_lat;
  76. u64 max_lat_at;
  77. u64 total_lat;
  78. u64 nb_atoms;
  79. u64 total_runtime;
  80. };
  /* Comparison callback used to order two work_atoms entries. */
  typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

  struct perf_sched;

  /* Set of callbacks one analysis mode installs for the scheduler events. */
  struct trace_sched_handler {
      int (*switch_event)(struct perf_sched *sched,
                          struct perf_evsel *evsel,
                          struct perf_sample *sample,
                          struct machine *machine);

      int (*runtime_event)(struct perf_sched *sched,
                           struct perf_evsel *evsel,
                           struct perf_sample *sample,
                           struct machine *machine);

      int (*wakeup_event)(struct perf_sched *sched,
                          struct perf_evsel *evsel,
                          struct perf_sample *sample,
                          struct machine *machine);

      /* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
      int (*fork_event)(struct perf_sched *sched,
                        union perf_event *event,
                        struct machine *machine);

      int (*migrate_task_event)(struct perf_sched *sched,
                                struct perf_evsel *evsel,
                                struct perf_sample *sample,
                                struct machine *machine);
  };
  98. struct perf_sched {
  99. struct perf_tool tool;
  100. const char *sort_order;
  101. unsigned long nr_tasks;
  102. struct task_desc **pid_to_task;
  103. struct task_desc **tasks;
  104. const struct trace_sched_handler *tp_handler;
  105. pthread_mutex_t start_work_mutex;
  106. pthread_mutex_t work_done_wait_mutex;
  107. int profile_cpu;
  108. /*
  109. * Track the current task - that way we can know whether there's any
  110. * weird events, such as a task being switched away that is not current.
  111. */
  112. int max_cpu;
  113. u32 curr_pid[MAX_CPUS];
  114. struct thread *curr_thread[MAX_CPUS];
  115. char next_shortname1;
  116. char next_shortname2;
  117. unsigned int replay_repeat;
  118. unsigned long nr_run_events;
  119. unsigned long nr_sleep_events;
  120. unsigned long nr_wakeup_events;
  121. unsigned long nr_sleep_corrections;
  122. unsigned long nr_run_events_optimized;
  123. unsigned long targetless_wakeups;
  124. unsigned long multitarget_wakeups;
  125. unsigned long nr_runs;
  126. unsigned long nr_timestamps;
  127. unsigned long nr_unordered_timestamps;
  128. unsigned long nr_context_switch_bugs;
  129. unsigned long nr_events;
  130. unsigned long nr_lost_chunks;
  131. unsigned long nr_lost_events;
  132. u64 run_measurement_overhead;
  133. u64 sleep_measurement_overhead;
  134. u64 start_time;
  135. u64 cpu_usage;
  136. u64 runavg_cpu_usage;
  137. u64 parent_cpu_usage;
  138. u64 runavg_parent_cpu_usage;
  139. u64 sum_runtime;
  140. u64 sum_fluct;
  141. u64 run_avg;
  142. u64 all_runtime;
  143. u64 all_count;
  144. u64 cpu_last_switched[MAX_CPUS];
  145. struct rb_root atom_root, sorted_atom_root;
  146. struct list_head sort_list, cmp_pid;
  147. bool force;
  148. };
  149. static u64 get_nsecs(void)
  150. {
  151. struct timespec ts;
  152. clock_gettime(CLOCK_MONOTONIC, &ts);
  153. return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
  154. }
  155. static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
  156. {
  157. u64 T0 = get_nsecs(), T1;
  158. do {
  159. T1 = get_nsecs();
  160. } while (T1 + sched->run_measurement_overhead < T0 + nsecs);
  161. }
  162. static void sleep_nsecs(u64 nsecs)
  163. {
  164. struct timespec ts;
  165. ts.tv_nsec = nsecs % 999999999;
  166. ts.tv_sec = nsecs / 999999999;
  167. nanosleep(&ts, NULL);
  168. }
  169. static void calibrate_run_measurement_overhead(struct perf_sched *sched)
  170. {
  171. u64 T0, T1, delta, min_delta = 1000000000ULL;
  172. int i;
  173. for (i = 0; i < 10; i++) {
  174. T0 = get_nsecs();
  175. burn_nsecs(sched, 0);
  176. T1 = get_nsecs();
  177. delta = T1-T0;
  178. min_delta = min(min_delta, delta);
  179. }
  180. sched->run_measurement_overhead = min_delta;
  181. printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
  182. }
  183. static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
  184. {
  185. u64 T0, T1, delta, min_delta = 1000000000ULL;
  186. int i;
  187. for (i = 0; i < 10; i++) {
  188. T0 = get_nsecs();
  189. sleep_nsecs(10000);
  190. T1 = get_nsecs();
  191. delta = T1-T0;
  192. min_delta = min(min_delta, delta);
  193. }
  194. min_delta -= 10000;
  195. sched->sleep_measurement_overhead = min_delta;
  196. printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
  197. }
  198. static struct sched_atom *
  199. get_new_event(struct task_desc *task, u64 timestamp)
  200. {
  201. struct sched_atom *event = zalloc(sizeof(*event));
  202. unsigned long idx = task->nr_events;
  203. size_t size;
  204. event->timestamp = timestamp;
  205. event->nr = idx;
  206. task->nr_events++;
  207. size = sizeof(struct sched_atom *) * task->nr_events;
  208. task->atoms = realloc(task->atoms, size);
  209. BUG_ON(!task->atoms);
  210. task->atoms[idx] = event;
  211. return event;
  212. }
  213. static struct sched_atom *last_event(struct task_desc *task)
  214. {
  215. if (!task->nr_events)
  216. return NULL;
  217. return task->atoms[task->nr_events - 1];
  218. }
  219. static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
  220. u64 timestamp, u64 duration)
  221. {
  222. struct sched_atom *event, *curr_event = last_event(task);
  223. /*
  224. * optimize an existing RUN event by merging this one
  225. * to it:
  226. */
  227. if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
  228. sched->nr_run_events_optimized++;
  229. curr_event->duration += duration;
  230. return;
  231. }
  232. event = get_new_event(task, timestamp);
  233. event->type = SCHED_EVENT_RUN;
  234. event->duration = duration;
  235. sched->nr_run_events++;
  236. }
  237. static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
  238. u64 timestamp, struct task_desc *wakee)
  239. {
  240. struct sched_atom *event, *wakee_event;
  241. event = get_new_event(task, timestamp);
  242. event->type = SCHED_EVENT_WAKEUP;
  243. event->wakee = wakee;
  244. wakee_event = last_event(wakee);
  245. if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
  246. sched->targetless_wakeups++;
  247. return;
  248. }
  249. if (wakee_event->wait_sem) {
  250. sched->multitarget_wakeups++;
  251. return;
  252. }
  253. wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
  254. sem_init(wakee_event->wait_sem, 0, 0);
  255. wakee_event->specific_wait = 1;
  256. event->wait_sem = wakee_event->wait_sem;
  257. sched->nr_wakeup_events++;
  258. }
  259. static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
  260. u64 timestamp, u64 task_state __maybe_unused)
  261. {
  262. struct sched_atom *event = get_new_event(task, timestamp);
  263. event->type = SCHED_EVENT_SLEEP;
  264. sched->nr_sleep_events++;
  265. }
  266. static struct task_desc *register_pid(struct perf_sched *sched,
  267. unsigned long pid, const char *comm)
  268. {
  269. struct task_desc *task;
  270. static int pid_max;
  271. if (sched->pid_to_task == NULL) {
  272. if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
  273. pid_max = MAX_PID;
  274. BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
  275. }
  276. if (pid >= (unsigned long)pid_max) {
  277. BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
  278. sizeof(struct task_desc *))) == NULL);
  279. while (pid >= (unsigned long)pid_max)
  280. sched->pid_to_task[pid_max++] = NULL;
  281. }
  282. task = sched->pid_to_task[pid];
  283. if (task)
  284. return task;
  285. task = zalloc(sizeof(*task));
  286. task->pid = pid;
  287. task->nr = sched->nr_tasks;
  288. strcpy(task->comm, comm);
  289. /*
  290. * every task starts in sleeping state - this gets ignored
  291. * if there's no wakeup pointing to this sleep state:
  292. */
  293. add_sched_event_sleep(sched, task, 0, 0);
  294. sched->pid_to_task[pid] = task;
  295. sched->nr_tasks++;
  296. sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
  297. BUG_ON(!sched->tasks);
  298. sched->tasks[task->nr] = task;
  299. if (verbose)
  300. printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
  301. return task;
  302. }
  303. static void print_task_traces(struct perf_sched *sched)
  304. {
  305. struct task_desc *task;
  306. unsigned long i;
  307. for (i = 0; i < sched->nr_tasks; i++) {
  308. task = sched->tasks[i];
  309. printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
  310. task->nr, task->comm, task->pid, task->nr_events);
  311. }
  312. }
  313. static void add_cross_task_wakeups(struct perf_sched *sched)
  314. {
  315. struct task_desc *task1, *task2;
  316. unsigned long i, j;
  317. for (i = 0; i < sched->nr_tasks; i++) {
  318. task1 = sched->tasks[i];
  319. j = i + 1;
  320. if (j == sched->nr_tasks)
  321. j = 0;
  322. task2 = sched->tasks[j];
  323. add_sched_event_wakeup(sched, task1, 0, task2);
  324. }
  325. }
  326. static void perf_sched__process_event(struct perf_sched *sched,
  327. struct sched_atom *atom)
  328. {
  329. int ret = 0;
  330. switch (atom->type) {
  331. case SCHED_EVENT_RUN:
  332. burn_nsecs(sched, atom->duration);
  333. break;
  334. case SCHED_EVENT_SLEEP:
  335. if (atom->wait_sem)
  336. ret = sem_wait(atom->wait_sem);
  337. BUG_ON(ret);
  338. break;
  339. case SCHED_EVENT_WAKEUP:
  340. if (atom->wait_sem)
  341. ret = sem_post(atom->wait_sem);
  342. BUG_ON(ret);
  343. break;
  344. case SCHED_EVENT_MIGRATION:
  345. break;
  346. default:
  347. BUG_ON(1);
  348. }
  349. }
  350. static u64 get_cpu_usage_nsec_parent(void)
  351. {
  352. struct rusage ru;
  353. u64 sum;
  354. int err;
  355. err = getrusage(RUSAGE_SELF, &ru);
  356. BUG_ON(err);
  357. sum = ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
  358. sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
  359. return sum;
  360. }
  361. static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
  362. {
  363. struct perf_event_attr attr;
  364. char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
  365. int fd;
  366. struct rlimit limit;
  367. bool need_privilege = false;
  368. memset(&attr, 0, sizeof(attr));
  369. attr.type = PERF_TYPE_SOFTWARE;
  370. attr.config = PERF_COUNT_SW_TASK_CLOCK;
  371. force_again:
  372. fd = sys_perf_event_open(&attr, 0, -1, -1,
  373. perf_event_open_cloexec_flag());
  374. if (fd < 0) {
  375. if (errno == EMFILE) {
  376. if (sched->force) {
  377. BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
  378. limit.rlim_cur += sched->nr_tasks - cur_task;
  379. if (limit.rlim_cur > limit.rlim_max) {
  380. limit.rlim_max = limit.rlim_cur;
  381. need_privilege = true;
  382. }
  383. if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
  384. if (need_privilege && errno == EPERM)
  385. strcpy(info, "Need privilege\n");
  386. } else
  387. goto force_again;
  388. } else
  389. strcpy(info, "Have a try with -f option\n");
  390. }
  391. pr_err("Error: sys_perf_event_open() syscall returned "
  392. "with %d (%s)\n%s", fd,
  393. strerror_r(errno, sbuf, sizeof(sbuf)), info);
  394. exit(EXIT_FAILURE);
  395. }
  396. return fd;
  397. }
  398. static u64 get_cpu_usage_nsec_self(int fd)
  399. {
  400. u64 runtime;
  401. int ret;
  402. ret = read(fd, &runtime, sizeof(runtime));
  403. BUG_ON(ret != sizeof(runtime));
  404. return runtime;
  405. }
  /*
   * Arguments handed to thread_func().  Allocated by create_tasks() and
   * freed by the thread once it has copied the fields.
   */
  struct sched_thread_parms {
      struct task_desc  *task;    /* task this thread replays */
      struct perf_sched *sched;
      int                fd;      /* counter fd from self_open_counters() */
  };
  411. static void *thread_func(void *ctx)
  412. {
  413. struct sched_thread_parms *parms = ctx;
  414. struct task_desc *this_task = parms->task;
  415. struct perf_sched *sched = parms->sched;
  416. u64 cpu_usage_0, cpu_usage_1;
  417. unsigned long i, ret;
  418. char comm2[22];
  419. int fd = parms->fd;
  420. zfree(&parms);
  421. sprintf(comm2, ":%s", this_task->comm);
  422. prctl(PR_SET_NAME, comm2);
  423. if (fd < 0)
  424. return NULL;
  425. again:
  426. ret = sem_post(&this_task->ready_for_work);
  427. BUG_ON(ret);
  428. ret = pthread_mutex_lock(&sched->start_work_mutex);
  429. BUG_ON(ret);
  430. ret = pthread_mutex_unlock(&sched->start_work_mutex);
  431. BUG_ON(ret);
  432. cpu_usage_0 = get_cpu_usage_nsec_self(fd);
  433. for (i = 0; i < this_task->nr_events; i++) {
  434. this_task->curr_event = i;
  435. perf_sched__process_event(sched, this_task->atoms[i]);
  436. }
  437. cpu_usage_1 = get_cpu_usage_nsec_self(fd);
  438. this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
  439. ret = sem_post(&this_task->work_done_sem);
  440. BUG_ON(ret);
  441. ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
  442. BUG_ON(ret);
  443. ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
  444. BUG_ON(ret);
  445. goto again;
  446. }
  447. static void create_tasks(struct perf_sched *sched)
  448. {
  449. struct task_desc *task;
  450. pthread_attr_t attr;
  451. unsigned long i;
  452. int err;
  453. err = pthread_attr_init(&attr);
  454. BUG_ON(err);
  455. err = pthread_attr_setstacksize(&attr,
  456. (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
  457. BUG_ON(err);
  458. err = pthread_mutex_lock(&sched->start_work_mutex);
  459. BUG_ON(err);
  460. err = pthread_mutex_lock(&sched->work_done_wait_mutex);
  461. BUG_ON(err);
  462. for (i = 0; i < sched->nr_tasks; i++) {
  463. struct sched_thread_parms *parms = malloc(sizeof(*parms));
  464. BUG_ON(parms == NULL);
  465. parms->task = task = sched->tasks[i];
  466. parms->sched = sched;
  467. parms->fd = self_open_counters(sched, i);
  468. sem_init(&task->sleep_sem, 0, 0);
  469. sem_init(&task->ready_for_work, 0, 0);
  470. sem_init(&task->work_done_sem, 0, 0);
  471. task->curr_event = 0;
  472. err = pthread_create(&task->thread, &attr, thread_func, parms);
  473. BUG_ON(err);
  474. }
  475. }
  476. static void wait_for_tasks(struct perf_sched *sched)
  477. {
  478. u64 cpu_usage_0, cpu_usage_1;
  479. struct task_desc *task;
  480. unsigned long i, ret;
  481. sched->start_time = get_nsecs();
  482. sched->cpu_usage = 0;
  483. pthread_mutex_unlock(&sched->work_done_wait_mutex);
  484. for (i = 0; i < sched->nr_tasks; i++) {
  485. task = sched->tasks[i];
  486. ret = sem_wait(&task->ready_for_work);
  487. BUG_ON(ret);
  488. sem_init(&task->ready_for_work, 0, 0);
  489. }
  490. ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
  491. BUG_ON(ret);
  492. cpu_usage_0 = get_cpu_usage_nsec_parent();
  493. pthread_mutex_unlock(&sched->start_work_mutex);
  494. for (i = 0; i < sched->nr_tasks; i++) {
  495. task = sched->tasks[i];
  496. ret = sem_wait(&task->work_done_sem);
  497. BUG_ON(ret);
  498. sem_init(&task->work_done_sem, 0, 0);
  499. sched->cpu_usage += task->cpu_usage;
  500. task->cpu_usage = 0;
  501. }
  502. cpu_usage_1 = get_cpu_usage_nsec_parent();
  503. if (!sched->runavg_cpu_usage)
  504. sched->runavg_cpu_usage = sched->cpu_usage;
  505. sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;
  506. sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
  507. if (!sched->runavg_parent_cpu_usage)
  508. sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
  509. sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
  510. sched->parent_cpu_usage)/sched->replay_repeat;
  511. ret = pthread_mutex_lock(&sched->start_work_mutex);
  512. BUG_ON(ret);
  513. for (i = 0; i < sched->nr_tasks; i++) {
  514. task = sched->tasks[i];
  515. sem_init(&task->sleep_sem, 0, 0);
  516. task->curr_event = 0;
  517. }
  518. }
  519. static void run_one_test(struct perf_sched *sched)
  520. {
  521. u64 T0, T1, delta, avg_delta, fluct;
  522. T0 = get_nsecs();
  523. wait_for_tasks(sched);
  524. T1 = get_nsecs();
  525. delta = T1 - T0;
  526. sched->sum_runtime += delta;
  527. sched->nr_runs++;
  528. avg_delta = sched->sum_runtime / sched->nr_runs;
  529. if (delta < avg_delta)
  530. fluct = avg_delta - delta;
  531. else
  532. fluct = delta - avg_delta;
  533. sched->sum_fluct += fluct;
  534. if (!sched->run_avg)
  535. sched->run_avg = delta;
  536. sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
  537. printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
  538. printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
  539. printf("cpu: %0.2f / %0.2f",
  540. (double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
  541. #if 0
  542. /*
  543. * rusage statistics done by the parent, these are less
  544. * accurate than the sched->sum_exec_runtime based statistics:
  545. */
  546. printf(" [%0.2f / %0.2f]",
  547. (double)sched->parent_cpu_usage/1e6,
  548. (double)sched->runavg_parent_cpu_usage/1e6);
  549. #endif
  550. printf("\n");
  551. if (sched->nr_sleep_corrections)
  552. printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
  553. sched->nr_sleep_corrections = 0;
  554. }
  555. static void test_calibrations(struct perf_sched *sched)
  556. {
  557. u64 T0, T1;
  558. T0 = get_nsecs();
  559. burn_nsecs(sched, 1e6);
  560. T1 = get_nsecs();
  561. printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
  562. T0 = get_nsecs();
  563. sleep_nsecs(1e6);
  564. T1 = get_nsecs();
  565. printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
  566. }
  567. static int
  568. replay_wakeup_event(struct perf_sched *sched,
  569. struct perf_evsel *evsel, struct perf_sample *sample,
  570. struct machine *machine __maybe_unused)
  571. {
  572. const char *comm = perf_evsel__strval(evsel, sample, "comm");
  573. const u32 pid = perf_evsel__intval(evsel, sample, "pid");
  574. struct task_desc *waker, *wakee;
  575. if (verbose) {
  576. printf("sched_wakeup event %p\n", evsel);
  577. printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
  578. }
  579. waker = register_pid(sched, sample->tid, "<unknown>");
  580. wakee = register_pid(sched, pid, comm);
  581. add_sched_event_wakeup(sched, waker, sample->time, wakee);
  582. return 0;
  583. }
  584. static int replay_switch_event(struct perf_sched *sched,
  585. struct perf_evsel *evsel,
  586. struct perf_sample *sample,
  587. struct machine *machine __maybe_unused)
  588. {
  589. const char *prev_comm = perf_evsel__strval(evsel, sample, "prev_comm"),
  590. *next_comm = perf_evsel__strval(evsel, sample, "next_comm");
  591. const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
  592. next_pid = perf_evsel__intval(evsel, sample, "next_pid");
  593. const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
  594. struct task_desc *prev, __maybe_unused *next;
  595. u64 timestamp0, timestamp = sample->time;
  596. int cpu = sample->cpu;
  597. s64 delta;
  598. if (verbose)
  599. printf("sched_switch event %p\n", evsel);
  600. if (cpu >= MAX_CPUS || cpu < 0)
  601. return 0;
  602. timestamp0 = sched->cpu_last_switched[cpu];
  603. if (timestamp0)
  604. delta = timestamp - timestamp0;
  605. else
  606. delta = 0;
  607. if (delta < 0) {
  608. pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
  609. return -1;
  610. }
  611. pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
  612. prev_comm, prev_pid, next_comm, next_pid, delta);
  613. prev = register_pid(sched, prev_pid, prev_comm);
  614. next = register_pid(sched, next_pid, next_comm);
  615. sched->cpu_last_switched[cpu] = timestamp;
  616. add_sched_event_run(sched, prev, timestamp, delta);
  617. add_sched_event_sleep(sched, prev, timestamp, prev_state);
  618. return 0;
  619. }
  620. static int replay_fork_event(struct perf_sched *sched,
  621. union perf_event *event,
  622. struct machine *machine)
  623. {
  624. struct thread *child, *parent;
  625. child = machine__findnew_thread(machine, event->fork.pid,
  626. event->fork.tid);
  627. parent = machine__findnew_thread(machine, event->fork.ppid,
  628. event->fork.ptid);
  629. if (child == NULL || parent == NULL) {
  630. pr_debug("thread does not exist on fork event: child %p, parent %p\n",
  631. child, parent);
  632. return 0;
  633. }
  634. if (verbose) {
  635. printf("fork event\n");
  636. printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
  637. printf("... child: %s/%d\n", thread__comm_str(child), child->tid);
  638. }
  639. register_pid(sched, parent->tid, thread__comm_str(parent));
  640. register_pid(sched, child->tid, thread__comm_str(child));
  641. return 0;
  642. }
  643. struct sort_dimension {
  644. const char *name;
  645. sort_fn_t cmp;
  646. struct list_head list;
  647. };
  648. static int
  649. thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
  650. {
  651. struct sort_dimension *sort;
  652. int ret = 0;
  653. BUG_ON(list_empty(list));
  654. list_for_each_entry(sort, list, list) {
  655. ret = sort->cmp(l, r);
  656. if (ret)
  657. return ret;
  658. }
  659. return ret;
  660. }
  661. static struct work_atoms *
  662. thread_atoms_search(struct rb_root *root, struct thread *thread,
  663. struct list_head *sort_list)
  664. {
  665. struct rb_node *node = root->rb_node;
  666. struct work_atoms key = { .thread = thread };
  667. while (node) {
  668. struct work_atoms *atoms;
  669. int cmp;
  670. atoms = container_of(node, struct work_atoms, node);
  671. cmp = thread_lat_cmp(sort_list, &key, atoms);
  672. if (cmp > 0)
  673. node = node->rb_left;
  674. else if (cmp < 0)
  675. node = node->rb_right;
  676. else {
  677. BUG_ON(thread != atoms->thread);
  678. return atoms;
  679. }
  680. }
  681. return NULL;
  682. }
  683. static void
  684. __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
  685. struct list_head *sort_list)
  686. {
  687. struct rb_node **new = &(root->rb_node), *parent = NULL;
  688. while (*new) {
  689. struct work_atoms *this;
  690. int cmp;
  691. this = container_of(*new, struct work_atoms, node);
  692. parent = *new;
  693. cmp = thread_lat_cmp(sort_list, data, this);
  694. if (cmp > 0)
  695. new = &((*new)->rb_left);
  696. else
  697. new = &((*new)->rb_right);
  698. }
  699. rb_link_node(&data->node, parent, new);
  700. rb_insert_color(&data->node, root);
  701. }
  702. static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
  703. {
  704. struct work_atoms *atoms = zalloc(sizeof(*atoms));
  705. if (!atoms) {
  706. pr_err("No memory at %s\n", __func__);
  707. return -1;
  708. }
  709. atoms->thread = thread__get(thread);
  710. INIT_LIST_HEAD(&atoms->work_list);
  711. __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
  712. return 0;
  713. }
  714. static char sched_out_state(u64 prev_state)
  715. {
  716. const char *str = TASK_STATE_TO_CHAR_STR;
  717. return str[prev_state];
  718. }
  719. static int
  720. add_sched_out_event(struct work_atoms *atoms,
  721. char run_state,
  722. u64 timestamp)
  723. {
  724. struct work_atom *atom = zalloc(sizeof(*atom));
  725. if (!atom) {
  726. pr_err("Non memory at %s", __func__);
  727. return -1;
  728. }
  729. atom->sched_out_time = timestamp;
  730. if (run_state == 'R') {
  731. atom->state = THREAD_WAIT_CPU;
  732. atom->wake_up_time = atom->sched_out_time;
  733. }
  734. list_add_tail(&atom->list, &atoms->work_list);
  735. return 0;
  736. }
  737. static void
  738. add_runtime_event(struct work_atoms *atoms, u64 delta,
  739. u64 timestamp __maybe_unused)
  740. {
  741. struct work_atom *atom;
  742. BUG_ON(list_empty(&atoms->work_list));
  743. atom = list_entry(atoms->work_list.prev, struct work_atom, list);
  744. atom->runtime += delta;
  745. atoms->total_runtime += delta;
  746. }
  747. static void
  748. add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
  749. {
  750. struct work_atom *atom;
  751. u64 delta;
  752. if (list_empty(&atoms->work_list))
  753. return;
  754. atom = list_entry(atoms->work_list.prev, struct work_atom, list);
  755. if (atom->state != THREAD_WAIT_CPU)
  756. return;
  757. if (timestamp < atom->wake_up_time) {
  758. atom->state = THREAD_IGNORE;
  759. return;
  760. }
  761. atom->state = THREAD_SCHED_IN;
  762. atom->sched_in_time = timestamp;
  763. delta = atom->sched_in_time - atom->wake_up_time;
  764. atoms->total_lat += delta;
  765. if (delta > atoms->max_lat) {
  766. atoms->max_lat = delta;
  767. atoms->max_lat_at = timestamp;
  768. }
  769. atoms->nb_atoms++;
  770. }
  771. static int latency_switch_event(struct perf_sched *sched,
  772. struct perf_evsel *evsel,
  773. struct perf_sample *sample,
  774. struct machine *machine)
  775. {
  776. const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
  777. next_pid = perf_evsel__intval(evsel, sample, "next_pid");
  778. const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
  779. struct work_atoms *out_events, *in_events;
  780. struct thread *sched_out, *sched_in;
  781. u64 timestamp0, timestamp = sample->time;
  782. int cpu = sample->cpu;
  783. s64 delta;
  784. BUG_ON(cpu >= MAX_CPUS || cpu < 0);
  785. timestamp0 = sched->cpu_last_switched[cpu];
  786. sched->cpu_last_switched[cpu] = timestamp;
  787. if (timestamp0)
  788. delta = timestamp - timestamp0;
  789. else
  790. delta = 0;
  791. if (delta < 0) {
  792. pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
  793. return -1;
  794. }
  795. sched_out = machine__findnew_thread(machine, -1, prev_pid);
  796. sched_in = machine__findnew_thread(machine, -1, next_pid);
  797. out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
  798. if (!out_events) {
  799. if (thread_atoms_insert(sched, sched_out))
  800. return -1;
  801. out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
  802. if (!out_events) {
  803. pr_err("out-event: Internal tree error");
  804. return -1;
  805. }
  806. }
  807. if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
  808. return -1;
  809. in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
  810. if (!in_events) {
  811. if (thread_atoms_insert(sched, sched_in))
  812. return -1;
  813. in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
  814. if (!in_events) {
  815. pr_err("in-event: Internal tree error");
  816. return -1;
  817. }
  818. /*
  819. * Take came in we have not heard about yet,
  820. * add in an initial atom in runnable state:
  821. */
  822. if (add_sched_out_event(in_events, 'R', timestamp))
  823. return -1;
  824. }
  825. add_sched_in_event(in_events, timestamp);
  826. return 0;
  827. }
  828. static int latency_runtime_event(struct perf_sched *sched,
  829. struct perf_evsel *evsel,
  830. struct perf_sample *sample,
  831. struct machine *machine)
  832. {
  833. const u32 pid = perf_evsel__intval(evsel, sample, "pid");
  834. const u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
  835. struct thread *thread = machine__findnew_thread(machine, -1, pid);
  836. struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
  837. u64 timestamp = sample->time;
  838. int cpu = sample->cpu;
  839. BUG_ON(cpu >= MAX_CPUS || cpu < 0);
  840. if (!atoms) {
  841. if (thread_atoms_insert(sched, thread))
  842. return -1;
  843. atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
  844. if (!atoms) {
  845. pr_err("in-event: Internal tree error");
  846. return -1;
  847. }
  848. if (add_sched_out_event(atoms, 'R', timestamp))
  849. return -1;
  850. }
  851. add_runtime_event(atoms, runtime, timestamp);
  852. return 0;
  853. }
  854. static int latency_wakeup_event(struct perf_sched *sched,
  855. struct perf_evsel *evsel,
  856. struct perf_sample *sample,
  857. struct machine *machine)
  858. {
  859. const u32 pid = perf_evsel__intval(evsel, sample, "pid");
  860. struct work_atoms *atoms;
  861. struct work_atom *atom;
  862. struct thread *wakee;
  863. u64 timestamp = sample->time;
  864. wakee = machine__findnew_thread(machine, -1, pid);
  865. atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
  866. if (!atoms) {
  867. if (thread_atoms_insert(sched, wakee))
  868. return -1;
  869. atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
  870. if (!atoms) {
  871. pr_err("wakeup-event: Internal tree error");
  872. return -1;
  873. }
  874. if (add_sched_out_event(atoms, 'S', timestamp))
  875. return -1;
  876. }
  877. BUG_ON(list_empty(&atoms->work_list));
  878. atom = list_entry(atoms->work_list.prev, struct work_atom, list);
  879. /*
  880. * As we do not guarantee the wakeup event happens when
  881. * task is out of run queue, also may happen when task is
  882. * on run queue and wakeup only change ->state to TASK_RUNNING,
  883. * then we should not set the ->wake_up_time when wake up a
  884. * task which is on run queue.
  885. *
  886. * You WILL be missing events if you've recorded only
  887. * one CPU, or are only looking at only one, so don't
  888. * skip in this case.
  889. */
  890. if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
  891. return 0;
  892. sched->nr_timestamps++;
  893. if (atom->sched_out_time > timestamp) {
  894. sched->nr_unordered_timestamps++;
  895. return 0;
  896. }
  897. atom->state = THREAD_WAIT_CPU;
  898. atom->wake_up_time = timestamp;
  899. return 0;
  900. }
  901. static int latency_migrate_task_event(struct perf_sched *sched,
  902. struct perf_evsel *evsel,
  903. struct perf_sample *sample,
  904. struct machine *machine)
  905. {
  906. const u32 pid = perf_evsel__intval(evsel, sample, "pid");
  907. u64 timestamp = sample->time;
  908. struct work_atoms *atoms;
  909. struct work_atom *atom;
  910. struct thread *migrant;
  911. /*
  912. * Only need to worry about migration when profiling one CPU.
  913. */
  914. if (sched->profile_cpu == -1)
  915. return 0;
  916. migrant = machine__findnew_thread(machine, -1, pid);
  917. atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
  918. if (!atoms) {
  919. if (thread_atoms_insert(sched, migrant))
  920. return -1;
  921. register_pid(sched, migrant->tid, thread__comm_str(migrant));
  922. atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
  923. if (!atoms) {
  924. pr_err("migration-event: Internal tree error");
  925. return -1;
  926. }
  927. if (add_sched_out_event(atoms, 'R', timestamp))
  928. return -1;
  929. }
  930. BUG_ON(list_empty(&atoms->work_list));
  931. atom = list_entry(atoms->work_list.prev, struct work_atom, list);
  932. atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
  933. sched->nr_timestamps++;
  934. if (atom->sched_out_time > timestamp)
  935. sched->nr_unordered_timestamps++;
  936. return 0;
  937. }
  938. static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
  939. {
  940. int i;
  941. int ret;
  942. u64 avg;
  943. if (!work_list->nb_atoms)
  944. return;
  945. /*
  946. * Ignore idle threads:
  947. */
  948. if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
  949. return;
  950. sched->all_runtime += work_list->total_runtime;
  951. sched->all_count += work_list->nb_atoms;
  952. ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
  953. for (i = 0; i < 24 - ret; i++)
  954. printf(" ");
  955. avg = work_list->total_lat / work_list->nb_atoms;
  956. printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13.6f s\n",
  957. (double)work_list->total_runtime / 1e6,
  958. work_list->nb_atoms, (double)avg / 1e6,
  959. (double)work_list->max_lat / 1e6,
  960. (double)work_list->max_lat_at / 1e9);
  961. }
  962. static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
  963. {
  964. if (l->thread->tid < r->thread->tid)
  965. return -1;
  966. if (l->thread->tid > r->thread->tid)
  967. return 1;
  968. return 0;
  969. }
  970. static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
  971. {
  972. u64 avgl, avgr;
  973. if (!l->nb_atoms)
  974. return -1;
  975. if (!r->nb_atoms)
  976. return 1;
  977. avgl = l->total_lat / l->nb_atoms;
  978. avgr = r->total_lat / r->nb_atoms;
  979. if (avgl < avgr)
  980. return -1;
  981. if (avgl > avgr)
  982. return 1;
  983. return 0;
  984. }
  985. static int max_cmp(struct work_atoms *l, struct work_atoms *r)
  986. {
  987. if (l->max_lat < r->max_lat)
  988. return -1;
  989. if (l->max_lat > r->max_lat)
  990. return 1;
  991. return 0;
  992. }
  993. static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
  994. {
  995. if (l->nb_atoms < r->nb_atoms)
  996. return -1;
  997. if (l->nb_atoms > r->nb_atoms)
  998. return 1;
  999. return 0;
  1000. }
  1001. static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
  1002. {
  1003. if (l->total_runtime < r->total_runtime)
  1004. return -1;
  1005. if (l->total_runtime > r->total_runtime)
  1006. return 1;
  1007. return 0;
  1008. }
  1009. static int sort_dimension__add(const char *tok, struct list_head *list)
  1010. {
  1011. size_t i;
  1012. static struct sort_dimension avg_sort_dimension = {
  1013. .name = "avg",
  1014. .cmp = avg_cmp,
  1015. };
  1016. static struct sort_dimension max_sort_dimension = {
  1017. .name = "max",
  1018. .cmp = max_cmp,
  1019. };
  1020. static struct sort_dimension pid_sort_dimension = {
  1021. .name = "pid",
  1022. .cmp = pid_cmp,
  1023. };
  1024. static struct sort_dimension runtime_sort_dimension = {
  1025. .name = "runtime",
  1026. .cmp = runtime_cmp,
  1027. };
  1028. static struct sort_dimension switch_sort_dimension = {
  1029. .name = "switch",
  1030. .cmp = switch_cmp,
  1031. };
  1032. struct sort_dimension *available_sorts[] = {
  1033. &pid_sort_dimension,
  1034. &avg_sort_dimension,
  1035. &max_sort_dimension,
  1036. &switch_sort_dimension,
  1037. &runtime_sort_dimension,
  1038. };
  1039. for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
  1040. if (!strcmp(available_sorts[i]->name, tok)) {
  1041. list_add_tail(&available_sorts[i]->list, list);
  1042. return 0;
  1043. }
  1044. }
  1045. return -1;
  1046. }
  1047. static void perf_sched__sort_lat(struct perf_sched *sched)
  1048. {
  1049. struct rb_node *node;
  1050. for (;;) {
  1051. struct work_atoms *data;
  1052. node = rb_first(&sched->atom_root);
  1053. if (!node)
  1054. break;
  1055. rb_erase(node, &sched->atom_root);
  1056. data = rb_entry(node, struct work_atoms, node);
  1057. __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
  1058. }
  1059. }
  1060. static int process_sched_wakeup_event(struct perf_tool *tool,
  1061. struct perf_evsel *evsel,
  1062. struct perf_sample *sample,
  1063. struct machine *machine)
  1064. {
  1065. struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
  1066. if (sched->tp_handler->wakeup_event)
  1067. return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
  1068. return 0;
  1069. }
  1070. static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
  1071. struct perf_sample *sample, struct machine *machine)
  1072. {
  1073. const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
  1074. struct thread *sched_in;
  1075. int new_shortname;
  1076. u64 timestamp0, timestamp = sample->time;
  1077. s64 delta;
  1078. int cpu, this_cpu = sample->cpu;
  1079. BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
  1080. if (this_cpu > sched->max_cpu)
  1081. sched->max_cpu = this_cpu;
  1082. timestamp0 = sched->cpu_last_switched[this_cpu];
  1083. sched->cpu_last_switched[this_cpu] = timestamp;
  1084. if (timestamp0)
  1085. delta = timestamp - timestamp0;
  1086. else
  1087. delta = 0;
  1088. if (delta < 0) {
  1089. pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
  1090. return -1;
  1091. }
  1092. sched_in = machine__findnew_thread(machine, -1, next_pid);
  1093. sched->curr_thread[this_cpu] = sched_in;
  1094. printf(" ");
  1095. new_shortname = 0;
  1096. if (!sched_in->shortname[0]) {
  1097. if (!strcmp(thread__comm_str(sched_in), "swapper")) {
  1098. /*
  1099. * Don't allocate a letter-number for swapper:0
  1100. * as a shortname. Instead, we use '.' for it.
  1101. */
  1102. sched_in->shortname[0] = '.';
  1103. sched_in->shortname[1] = ' ';
  1104. } else {
  1105. sched_in->shortname[0] = sched->next_shortname1;
  1106. sched_in->shortname[1] = sched->next_shortname2;
  1107. if (sched->next_shortname1 < 'Z') {
  1108. sched->next_shortname1++;
  1109. } else {
  1110. sched->next_shortname1 = 'A';
  1111. if (sched->next_shortname2 < '9')
  1112. sched->next_shortname2++;
  1113. else
  1114. sched->next_shortname2 = '0';
  1115. }
  1116. }
  1117. new_shortname = 1;
  1118. }
  1119. for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
  1120. if (cpu != this_cpu)
  1121. printf(" ");
  1122. else
  1123. printf("*");
  1124. if (sched->curr_thread[cpu])
  1125. printf("%2s ", sched->curr_thread[cpu]->shortname);
  1126. else
  1127. printf(" ");
  1128. }
  1129. printf(" %12.6f secs ", (double)timestamp/1e9);
  1130. if (new_shortname) {
  1131. printf("%s => %s:%d\n",
  1132. sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
  1133. } else {
  1134. printf("\n");
  1135. }
  1136. return 0;
  1137. }
  1138. static int process_sched_switch_event(struct perf_tool *tool,
  1139. struct perf_evsel *evsel,
  1140. struct perf_sample *sample,
  1141. struct machine *machine)
  1142. {
  1143. struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
  1144. int this_cpu = sample->cpu, err = 0;
  1145. u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
  1146. next_pid = perf_evsel__intval(evsel, sample, "next_pid");
  1147. if (sched->curr_pid[this_cpu] != (u32)-1) {
  1148. /*
  1149. * Are we trying to switch away a PID that is
  1150. * not current?
  1151. */
  1152. if (sched->curr_pid[this_cpu] != prev_pid)
  1153. sched->nr_context_switch_bugs++;
  1154. }
  1155. if (sched->tp_handler->switch_event)
  1156. err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
  1157. sched->curr_pid[this_cpu] = next_pid;
  1158. return err;
  1159. }
  1160. static int process_sched_runtime_event(struct perf_tool *tool,
  1161. struct perf_evsel *evsel,
  1162. struct perf_sample *sample,
  1163. struct machine *machine)
  1164. {
  1165. struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
  1166. if (sched->tp_handler->runtime_event)
  1167. return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
  1168. return 0;
  1169. }
  1170. static int perf_sched__process_fork_event(struct perf_tool *tool,
  1171. union perf_event *event,
  1172. struct perf_sample *sample,
  1173. struct machine *machine)
  1174. {
  1175. struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
  1176. /* run the fork event through the perf machineruy */
  1177. perf_event__process_fork(tool, event, sample, machine);
  1178. /* and then run additional processing needed for this command */
  1179. if (sched->tp_handler->fork_event)
  1180. return sched->tp_handler->fork_event(sched, event, machine);
  1181. return 0;
  1182. }
  1183. static int process_sched_migrate_task_event(struct perf_tool *tool,
  1184. struct perf_evsel *evsel,
  1185. struct perf_sample *sample,
  1186. struct machine *machine)
  1187. {
  1188. struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
  1189. if (sched->tp_handler->migrate_task_event)
  1190. return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
  1191. return 0;
  1192. }
  1193. typedef int (*tracepoint_handler)(struct perf_tool *tool,
  1194. struct perf_evsel *evsel,
  1195. struct perf_sample *sample,
  1196. struct machine *machine);
  1197. static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
  1198. union perf_event *event __maybe_unused,
  1199. struct perf_sample *sample,
  1200. struct perf_evsel *evsel,
  1201. struct machine *machine)
  1202. {
  1203. int err = 0;
  1204. if (evsel->handler != NULL) {
  1205. tracepoint_handler f = evsel->handler;
  1206. err = f(tool, evsel, sample, machine);
  1207. }
  1208. return err;
  1209. }
  1210. static int perf_sched__read_events(struct perf_sched *sched)
  1211. {
  1212. const struct perf_evsel_str_handler handlers[] = {
  1213. { "sched:sched_switch", process_sched_switch_event, },
  1214. { "sched:sched_stat_runtime", process_sched_runtime_event, },
  1215. { "sched:sched_wakeup", process_sched_wakeup_event, },
  1216. { "sched:sched_wakeup_new", process_sched_wakeup_event, },
  1217. { "sched:sched_migrate_task", process_sched_migrate_task_event, },
  1218. };
  1219. struct perf_session *session;
  1220. struct perf_data_file file = {
  1221. .path = input_name,
  1222. .mode = PERF_DATA_MODE_READ,
  1223. .force = sched->force,
  1224. };
  1225. int rc = -1;
  1226. session = perf_session__new(&file, false, &sched->tool);
  1227. if (session == NULL) {
  1228. pr_debug("No Memory for session\n");
  1229. return -1;
  1230. }
  1231. symbol__init(&session->header.env);
  1232. if (perf_session__set_tracepoints_handlers(session, handlers))
  1233. goto out_delete;
  1234. if (perf_session__has_traces(session, "record -R")) {
  1235. int err = perf_session__process_events(session);
  1236. if (err) {
  1237. pr_err("Failed to process events, error %d", err);
  1238. goto out_delete;
  1239. }
  1240. sched->nr_events = session->evlist->stats.nr_events[0];
  1241. sched->nr_lost_events = session->evlist->stats.total_lost;
  1242. sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
  1243. }
  1244. rc = 0;
  1245. out_delete:
  1246. perf_session__delete(session);
  1247. return rc;
  1248. }
  1249. static void print_bad_events(struct perf_sched *sched)
  1250. {
  1251. if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
  1252. printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
  1253. (double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
  1254. sched->nr_unordered_timestamps, sched->nr_timestamps);
  1255. }
  1256. if (sched->nr_lost_events && sched->nr_events) {
  1257. printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
  1258. (double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
  1259. sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
  1260. }
  1261. if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
  1262. printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)",
  1263. (double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
  1264. sched->nr_context_switch_bugs, sched->nr_timestamps);
  1265. if (sched->nr_lost_events)
  1266. printf(" (due to lost events?)");
  1267. printf("\n");
  1268. }
  1269. }
  1270. static int perf_sched__lat(struct perf_sched *sched)
  1271. {
  1272. struct rb_node *next;
  1273. setup_pager();
  1274. if (perf_sched__read_events(sched))
  1275. return -1;
  1276. perf_sched__sort_lat(sched);
  1277. printf("\n -----------------------------------------------------------------------------------------------------------------\n");
  1278. printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n");
  1279. printf(" -----------------------------------------------------------------------------------------------------------------\n");
  1280. next = rb_first(&sched->sorted_atom_root);
  1281. while (next) {
  1282. struct work_atoms *work_list;
  1283. work_list = rb_entry(next, struct work_atoms, node);
  1284. output_lat_thread(sched, work_list);
  1285. next = rb_next(next);
  1286. thread__zput(work_list->thread);
  1287. }
  1288. printf(" -----------------------------------------------------------------------------------------------------------------\n");
  1289. printf(" TOTAL: |%11.3f ms |%9" PRIu64 " |\n",
  1290. (double)sched->all_runtime / 1e6, sched->all_count);
  1291. printf(" ---------------------------------------------------\n");
  1292. print_bad_events(sched);
  1293. printf("\n");
  1294. return 0;
  1295. }
  1296. static int perf_sched__map(struct perf_sched *sched)
  1297. {
  1298. sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
  1299. setup_pager();
  1300. if (perf_sched__read_events(sched))
  1301. return -1;
  1302. print_bad_events(sched);
  1303. return 0;
  1304. }
  1305. static int perf_sched__replay(struct perf_sched *sched)
  1306. {
  1307. unsigned long i;
  1308. calibrate_run_measurement_overhead(sched);
  1309. calibrate_sleep_measurement_overhead(sched);
  1310. test_calibrations(sched);
  1311. if (perf_sched__read_events(sched))
  1312. return -1;
  1313. printf("nr_run_events: %ld\n", sched->nr_run_events);
  1314. printf("nr_sleep_events: %ld\n", sched->nr_sleep_events);
  1315. printf("nr_wakeup_events: %ld\n", sched->nr_wakeup_events);
  1316. if (sched->targetless_wakeups)
  1317. printf("target-less wakeups: %ld\n", sched->targetless_wakeups);
  1318. if (sched->multitarget_wakeups)
  1319. printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
  1320. if (sched->nr_run_events_optimized)
  1321. printf("run atoms optimized: %ld\n",
  1322. sched->nr_run_events_optimized);
  1323. print_task_traces(sched);
  1324. add_cross_task_wakeups(sched);
  1325. create_tasks(sched);
  1326. printf("------------------------------------------------------------\n");
  1327. for (i = 0; i < sched->replay_repeat; i++)
  1328. run_one_test(sched);
  1329. return 0;
  1330. }
  1331. static void setup_sorting(struct perf_sched *sched, const struct option *options,
  1332. const char * const usage_msg[])
  1333. {
  1334. char *tmp, *tok, *str = strdup(sched->sort_order);
  1335. for (tok = strtok_r(str, ", ", &tmp);
  1336. tok; tok = strtok_r(NULL, ", ", &tmp)) {
  1337. if (sort_dimension__add(tok, &sched->sort_list) < 0) {
  1338. error("Unknown --sort key: `%s'", tok);
  1339. usage_with_options(usage_msg, options);
  1340. }
  1341. }
  1342. free(str);
  1343. sort_dimension__add("pid", &sched->cmp_pid);
  1344. }
  1345. static int __cmd_record(int argc, const char **argv)
  1346. {
  1347. unsigned int rec_argc, i, j;
  1348. const char **rec_argv;
  1349. const char * const record_args[] = {
  1350. "record",
  1351. "-a",
  1352. "-R",
  1353. "-m", "1024",
  1354. "-c", "1",
  1355. "-e", "sched:sched_switch",
  1356. "-e", "sched:sched_stat_wait",
  1357. "-e", "sched:sched_stat_sleep",
  1358. "-e", "sched:sched_stat_iowait",
  1359. "-e", "sched:sched_stat_runtime",
  1360. "-e", "sched:sched_process_fork",
  1361. "-e", "sched:sched_wakeup",
  1362. "-e", "sched:sched_wakeup_new",
  1363. "-e", "sched:sched_migrate_task",
  1364. };
  1365. rec_argc = ARRAY_SIZE(record_args) + argc - 1;
  1366. rec_argv = calloc(rec_argc + 1, sizeof(char *));
  1367. if (rec_argv == NULL)
  1368. return -ENOMEM;
  1369. for (i = 0; i < ARRAY_SIZE(record_args); i++)
  1370. rec_argv[i] = strdup(record_args[i]);
  1371. for (j = 1; j < (unsigned int)argc; j++, i++)
  1372. rec_argv[i] = argv[j];
  1373. BUG_ON(i != rec_argc);
  1374. return cmd_record(i, rec_argv, NULL);
  1375. }
  1376. int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
  1377. {
  1378. const char default_sort_order[] = "avg, max, switch, runtime";
  1379. struct perf_sched sched = {
  1380. .tool = {
  1381. .sample = perf_sched__process_tracepoint_sample,
  1382. .comm = perf_event__process_comm,
  1383. .lost = perf_event__process_lost,
  1384. .fork = perf_sched__process_fork_event,
  1385. .ordered_events = true,
  1386. },
  1387. .cmp_pid = LIST_HEAD_INIT(sched.cmp_pid),
  1388. .sort_list = LIST_HEAD_INIT(sched.sort_list),
  1389. .start_work_mutex = PTHREAD_MUTEX_INITIALIZER,
  1390. .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
  1391. .sort_order = default_sort_order,
  1392. .replay_repeat = 10,
  1393. .profile_cpu = -1,
  1394. .next_shortname1 = 'A',
  1395. .next_shortname2 = '0',
  1396. };
  1397. const struct option latency_options[] = {
  1398. OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
  1399. "sort by key(s): runtime, switch, avg, max"),
  1400. OPT_INCR('v', "verbose", &verbose,
  1401. "be more verbose (show symbol address, etc)"),
  1402. OPT_INTEGER('C', "CPU", &sched.profile_cpu,
  1403. "CPU to profile on"),
  1404. OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
  1405. "dump raw trace in ASCII"),
  1406. OPT_END()
  1407. };
  1408. const struct option replay_options[] = {
  1409. OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
  1410. "repeat the workload replay N times (-1: infinite)"),
  1411. OPT_INCR('v', "verbose", &verbose,
  1412. "be more verbose (show symbol address, etc)"),
  1413. OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
  1414. "dump raw trace in ASCII"),
  1415. OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
  1416. OPT_END()
  1417. };
  1418. const struct option sched_options[] = {
  1419. OPT_STRING('i', "input", &input_name, "file",
  1420. "input file name"),
  1421. OPT_INCR('v', "verbose", &verbose,
  1422. "be more verbose (show symbol address, etc)"),
  1423. OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
  1424. "dump raw trace in ASCII"),
  1425. OPT_END()
  1426. };
  1427. const char * const latency_usage[] = {
  1428. "perf sched latency [<options>]",
  1429. NULL
  1430. };
  1431. const char * const replay_usage[] = {
  1432. "perf sched replay [<options>]",
  1433. NULL
  1434. };
  1435. const char *const sched_subcommands[] = { "record", "latency", "map",
  1436. "replay", "script", NULL };
  1437. const char *sched_usage[] = {
  1438. NULL,
  1439. NULL
  1440. };
  1441. struct trace_sched_handler lat_ops = {
  1442. .wakeup_event = latency_wakeup_event,
  1443. .switch_event = latency_switch_event,
  1444. .runtime_event = latency_runtime_event,
  1445. .migrate_task_event = latency_migrate_task_event,
  1446. };
  1447. struct trace_sched_handler map_ops = {
  1448. .switch_event = map_switch_event,
  1449. };
  1450. struct trace_sched_handler replay_ops = {
  1451. .wakeup_event = replay_wakeup_event,
  1452. .switch_event = replay_switch_event,
  1453. .fork_event = replay_fork_event,
  1454. };
  1455. unsigned int i;
  1456. for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
  1457. sched.curr_pid[i] = -1;
  1458. argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
  1459. sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
  1460. if (!argc)
  1461. usage_with_options(sched_usage, sched_options);
  1462. /*
  1463. * Aliased to 'perf script' for now:
  1464. */
  1465. if (!strcmp(argv[0], "script"))
  1466. return cmd_script(argc, argv, prefix);
  1467. if (!strncmp(argv[0], "rec", 3)) {
  1468. return __cmd_record(argc, argv);
  1469. } else if (!strncmp(argv[0], "lat", 3)) {
  1470. sched.tp_handler = &lat_ops;
  1471. if (argc > 1) {
  1472. argc = parse_options(argc, argv, latency_options, latency_usage, 0);
  1473. if (argc)
  1474. usage_with_options(latency_usage, latency_options);
  1475. }
  1476. setup_sorting(&sched, latency_options, latency_usage);
  1477. return perf_sched__lat(&sched);
  1478. } else if (!strcmp(argv[0], "map")) {
  1479. sched.tp_handler = &map_ops;
  1480. setup_sorting(&sched, latency_options, latency_usage);
  1481. return perf_sched__map(&sched);
  1482. } else if (!strncmp(argv[0], "rep", 3)) {
  1483. sched.tp_handler = &replay_ops;
  1484. if (argc) {
  1485. argc = parse_options(argc, argv, replay_options, replay_usage, 0);
  1486. if (argc)
  1487. usage_with_options(replay_usage, replay_options);
  1488. }
  1489. return perf_sched__replay(&sched);
  1490. } else {
  1491. usage_with_options(sched_usage, sched_options);
  1492. }
  1493. return 0;
  1494. }