libmultipath/io_err_stat.c
/*
 * (C) Copyright HUAWEI Technology Corp. 2017, All Rights Reserved.
 *
 * io_err_stat.c
 * version 1.0
 *
 * IO error stream statistic process for path failure event from kernel
 *
 * Author(s): Guan Junxiong 2017 <guanjunxiong@huawei.com>
 *
 * This file is released under the GPL version 2, or any later version.
 */
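
/*
 * Overview:
 *
 * A path that fails repeatedly within a short time frame is treated as
 * potentially marginal.  io_err_stat_handle_pathfail() enqueues such a
 * path, fails it in the kernel, and the io_err_stat thread probes it with
 * direct-I/O asynchronous reads for marginal_path_err_sample_time seconds.
 * poll_io_err_stat() then computes the observed error rate (per thousand
 * I/Os): if it is at or below marginal_path_err_rate_threshold, the path
 * may be reinstated; otherwise reinstatement stays disabled and the test
 * is repeated after marginal_path_err_recheck_gap_time seconds (see
 * hit_io_err_recheck_time()).
 */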

#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <libaio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/select.h>

#include "vector.h"
#include "memory.h"
#include "checkers.h"
#include "config.h"
#include "structs.h"
#include "structs_vec.h"
#include "devmapper.h"
#include "debug.h"
#include "lock.h"
#include "time-util.h"
#include "io_err_stat.h"

#define IOTIMEOUT_SEC			60
#define TIMEOUT_NO_IO_NSEC		10000000 /*10ms = 10000000ns*/
#define FLAKY_PATHFAIL_THRESHOLD	2
#define CONCUR_NR_EVENT			32

#define PATH_IO_ERR_IN_CHECKING		-1
#define PATH_IO_ERR_IN_POLLING_RECHECK	-2

#define io_err_stat_log(prio, fmt, args...) \
	condlog(prio, "io error statistic: " fmt, ##args)


struct io_err_stat_pathvec {
	pthread_mutex_t mutex;
	vector		pathvec;
};

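/*
 * One asynchronous direct-I/O read in flight against a path.  io_starttime
 * is zero while the slot is idle and records the submission time otherwise,
 * so it doubles as the "in use" flag for the slot.
 */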
struct dio_ctx {
	struct timespec	io_starttime;
	int		blksize;
	void		*buf;
	struct iocb	io;
};

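/*
 * Per-path bookkeeping for one check run: io_nr counts submitted reads,
 * io_err_nr counts reads that failed or timed out, and the run lasts
 * total_time seconds (marginal_path_err_sample_time), with the failure
 * rate compared against err_rate_threshold (errors per thousand I/Os).
 */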
struct io_err_stat_path {
	char		devname[FILE_NAME_SIZE];
	int		fd;
	struct dio_ctx	*dio_ctx_array;
	int		io_err_nr;
	int		io_nr;
	struct timespec	start_time;

	int		total_time;
	int		err_rate_threshold;
};

pthread_t		io_err_stat_thr;
pthread_attr_t		io_err_stat_attr;

static pthread_mutex_t io_err_thread_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_err_thread_cond = PTHREAD_COND_INITIALIZER;
static int io_err_thread_running = 0;

static struct io_err_stat_pathvec *paths;
struct vectors *vecs;
io_context_t	ioctx;

static void cancel_inflight_io(struct io_err_stat_path *pp);

static void rcu_unregister(void *param)
{
	rcu_unregister_thread();
}

struct io_err_stat_path *find_err_path_by_dev(vector pathvec, char *dev)
{
	int i;
	struct io_err_stat_path *pp;

	if (!pathvec)
		return NULL;
	vector_foreach_slot(pathvec, pp, i)
		if (!strcmp(pp->devname, dev))
			return pp;

	io_err_stat_log(4, "%s: not found in check queue", dev);

	return NULL;
}

static int init_each_dio_ctx(struct dio_ctx *ct, int blksize,
		unsigned long pgsize)
{
	ct->blksize = blksize;
	if (posix_memalign(&ct->buf, pgsize, blksize))
		return 1;
	memset(ct->buf, 0, blksize);
	ct->io_starttime.tv_sec = 0;
	ct->io_starttime.tv_nsec = 0;

	return 0;
}

static void deinit_each_dio_ctx(struct dio_ctx *ct)
{
	if (ct->buf)
		free(ct->buf);
}

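/*
 * Open /dev/<devname> with O_DIRECT, query the block size with BLKBSZGET
 * (falling back to 512 if the ioctl fails), and allocate CONCUR_NR_EVENT
 * dio_ctx slots with page-aligned buffers of one block each.
 * Returns 0 on success, 1 on failure.
 */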
static int setup_directio_ctx(struct io_err_stat_path *p)
{
	unsigned long pgsize = getpagesize();
	char fpath[PATH_MAX];
	int blksize = 0;
	int i;

	if (snprintf(fpath, PATH_MAX, "/dev/%s", p->devname) >= PATH_MAX)
		return 1;
	if (p->fd < 0)
		p->fd = open(fpath, O_RDONLY | O_DIRECT);
	if (p->fd < 0)
		return 1;

	p->dio_ctx_array = MALLOC(sizeof(struct dio_ctx) * CONCUR_NR_EVENT);
	if (!p->dio_ctx_array)
		goto fail_close;

	if (ioctl(p->fd, BLKBSZGET, &blksize) < 0) {
		io_err_stat_log(4, "%s: cannot get blocksize, set default 512",
				p->devname);
		blksize = 512;
	}
	if (!blksize)
		goto free_pdctx;

	for (i = 0; i < CONCUR_NR_EVENT; i++) {
		if (init_each_dio_ctx(p->dio_ctx_array + i, blksize, pgsize))
			goto deinit;
	}
	return 0;

deinit:
	for (i = 0; i < CONCUR_NR_EVENT; i++)
		deinit_each_dio_ctx(p->dio_ctx_array + i);
free_pdctx:
	FREE(p->dio_ctx_array);
fail_close:
	close(p->fd);

	return 1;
}

static void destroy_directio_ctx(struct io_err_stat_path *p)
{
	int i;

	if (!p || !p->dio_ctx_array)
		return;
	cancel_inflight_io(p);

	for (i = 0; i < CONCUR_NR_EVENT; i++)
		deinit_each_dio_ctx(p->dio_ctx_array + i);
	FREE(p->dio_ctx_array);

	if (p->fd > 0)
		close(p->fd);
}

static struct io_err_stat_path *alloc_io_err_stat_path(void)
{
	struct io_err_stat_path *p;

	p = (struct io_err_stat_path *)MALLOC(sizeof(*p));
	if (!p)
		return NULL;

	memset(p->devname, 0, sizeof(p->devname));
	p->io_err_nr = 0;
	p->io_nr = 0;
	p->total_time = 0;
	p->start_time.tv_sec = 0;
	p->start_time.tv_nsec = 0;
	p->err_rate_threshold = 0;
	p->fd = -1;

	return p;
}

static void free_io_err_stat_path(struct io_err_stat_path *p)
{
	FREE(p);
}

static struct io_err_stat_pathvec *alloc_pathvec(void)
{
	struct io_err_stat_pathvec *p;
	int r;

	p = (struct io_err_stat_pathvec *)MALLOC(sizeof(*p));
	if (!p)
		return NULL;
	p->pathvec = vector_alloc();
	if (!p->pathvec)
		goto out_free_struct_pathvec;
	r = pthread_mutex_init(&p->mutex, NULL);
	if (r)
		goto out_free_member_pathvec;

	return p;

out_free_member_pathvec:
	vector_free(p->pathvec);
out_free_struct_pathvec:
	FREE(p);
	return NULL;
}

static void free_io_err_pathvec(struct io_err_stat_pathvec *p)
{
	struct io_err_stat_path *path;
	int i;

	if (!p)
		return;
	pthread_mutex_destroy(&p->mutex);
	if (p->pathvec) {
		vector_foreach_slot(p->pathvec, path, i) {
			destroy_directio_ctx(path);
			free_io_err_stat_path(path);
		}
		vector_free(p->pathvec);
	}
	FREE(p);
}

/*
 * return value
 * 0: enqueued successfully
 * 1: failed because of an internal error
 * 2: failed because the path is already enqueued
 */
static int enqueue_io_err_stat_by_path(struct path *path)
{
	struct io_err_stat_path *p;

	pthread_mutex_lock(&paths->mutex);
	p = find_err_path_by_dev(paths->pathvec, path->dev);
	if (p) {
		pthread_mutex_unlock(&paths->mutex);
		return 2;
	}
	pthread_mutex_unlock(&paths->mutex);

	p = alloc_io_err_stat_path();
	if (!p)
		return 1;

	memcpy(p->devname, path->dev, sizeof(p->devname));
	p->total_time = path->mpp->marginal_path_err_sample_time;
	p->err_rate_threshold = path->mpp->marginal_path_err_rate_threshold;

	if (setup_directio_ctx(p))
		goto free_ioerr_path;
	pthread_mutex_lock(&paths->mutex);
	if (!vector_alloc_slot(paths->pathvec))
		goto unlock_destroy;
	vector_set_slot(paths->pathvec, p);
	pthread_mutex_unlock(&paths->mutex);

	if (!path->io_err_disable_reinstate) {
		/*
		 * Fail the path in the kernel for the duration of the test
		 * to make the test more reliable.
		 */
		io_err_stat_log(3, "%s: fail dm path %s before checking",
				path->mpp->alias, path->dev);
		path->io_err_disable_reinstate = 1;
		dm_fail_path(path->mpp->alias, path->dev_t);
		update_queue_mode_del_path(path->mpp);

		/*
		 * schedule path check as soon as possible to
		 * update path state to delayed state
		 */
		path->tick = 1;

	}
	io_err_stat_log(2, "%s: enqueue path %s to check",
			path->mpp->alias, path->dev);
	return 0;

unlock_destroy:
	pthread_mutex_unlock(&paths->mutex);
	destroy_directio_ctx(p);
free_ioerr_path:
	free_io_err_stat_path(p);

	return 1;
}

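/*
 * Called on a path failure event.  A path is enqueued for checking only if
 * it fails at least FLAKY_PATHFAIL_THRESHOLD times within
 * marginal_path_double_failed_time seconds, all four marginal_path_*
 * parameters are configured, and the map still has at least one other
 * active path.
 *
 * Illustrative multipath.conf defaults-section settings these checks rely
 * on (the values below are examples, not recommendations):
 *
 *	defaults {
 *		marginal_path_double_failed_time	5
 *		marginal_path_err_sample_time		120
 *		marginal_path_err_rate_threshold	10
 *		marginal_path_err_recheck_gap_time	600
 *	}
 */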
int io_err_stat_handle_pathfail(struct path *path)
{
	struct timespec curr_time;
	int res;

	if (uatomic_read(&io_err_thread_running) == 0)
		return 1;

	if (path->io_err_disable_reinstate) {
		io_err_stat_log(3, "%s: reinstate is already disabled",
				path->dev);
		return 1;
	}
	if (path->io_err_pathfail_cnt < 0)
		return 1;

	if (!path->mpp)
		return 1;
	if (path->mpp->nr_active <= 1)
		return 1;
	if (path->mpp->marginal_path_double_failed_time <= 0 ||
		path->mpp->marginal_path_err_sample_time <= 0 ||
		path->mpp->marginal_path_err_recheck_gap_time <= 0 ||
		path->mpp->marginal_path_err_rate_threshold < 0) {
		io_err_stat_log(4, "%s: parameter not set", path->mpp->alias);
		return 1;
	}
	if (path->mpp->marginal_path_err_sample_time < (2 * IOTIMEOUT_SEC)) {
		io_err_stat_log(2, "%s: marginal_path_err_sample_time must not be less than %d",
				path->mpp->alias, 2 * IOTIMEOUT_SEC);
		return 1;
	}
	/*
	 * The test should only be started for paths that have failed
	 * repeatedly in a certain time frame, so that we have reason
	 * to assume they're flaky. A path which fails at least
	 * FLAKY_PATHFAIL_THRESHOLD times within
	 * marginal_path_double_failed_time seconds is considered flaky.
	 */
	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
		return 1;
	if (path->io_err_pathfail_cnt == 0) {
		path->io_err_pathfail_cnt++;
		path->io_err_pathfail_starttime = curr_time.tv_sec;
		io_err_stat_log(5, "%s: start path flakiness pre-checking",
				path->dev);
		return 0;
	}
	if ((curr_time.tv_sec - path->io_err_pathfail_starttime) >
			path->mpp->marginal_path_double_failed_time) {
		path->io_err_pathfail_cnt = 0;
		path->io_err_pathfail_starttime = curr_time.tv_sec;
		io_err_stat_log(5, "%s: restart path flakiness pre-checking",
				path->dev);
	}
	path->io_err_pathfail_cnt++;
	if (path->io_err_pathfail_cnt >= FLAKY_PATHFAIL_THRESHOLD) {
		res = enqueue_io_err_stat_by_path(path);
		if (!res)
			path->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING;
		else
			path->io_err_pathfail_cnt = 0;
	}

	return 0;
}

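/*
 * Called from the path checker for paths whose reinstatement is disabled.
 * Returns 1 while the path should stay in the delayed/shaky state, and 0
 * once the path may be recovered.  When marginal_path_err_recheck_gap_time
 * seconds have elapsed since reinstatement was disabled, the path is
 * re-enqueued for another round of checking.
 */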
int hit_io_err_recheck_time(struct path *pp)
{
	struct timespec curr_time;
	int r;

	if (uatomic_read(&io_err_thread_running) == 0)
		return 0;
	if (pp->mpp->nr_active <= 0) {
		io_err_stat_log(2, "%s: recover path early", pp->dev);
		goto recover;
	}
	if (pp->io_err_pathfail_cnt != PATH_IO_ERR_IN_POLLING_RECHECK)
		return 1;
	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0 ||
	    (curr_time.tv_sec - pp->io_err_dis_reinstate_time) >
			pp->mpp->marginal_path_err_recheck_gap_time) {
		io_err_stat_log(4, "%s: reschedule checking after %d seconds",
				pp->dev,
				pp->mpp->marginal_path_err_recheck_gap_time);
		/*
		 * Reschedule the io error checking. If the path turns out
		 * to be good enough, it can be reinstated as soon as
		 * possible in the check_path routine.
		 */
		pp->io_err_dis_reinstate_time = curr_time.tv_sec;
		r = enqueue_io_err_stat_by_path(pp);
		/*
		 * If enqueueing fails because of an internal error,
		 * recover this path. Otherwise return 1 to set the path
		 * state to PATH_SHAKY.
		 */
		if (r == 1) {
			io_err_stat_log(3, "%s: enqueue fails, to recover",
					pp->dev);
			goto recover;
		} else if (!r) {
			pp->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING;
		}
	}

	return 1;

recover:
	pp->io_err_pathfail_cnt = 0;
	pp->io_err_disable_reinstate = 0;
	pp->tick = 1;
	return 0;
}

static int delete_io_err_stat_by_addr(struct io_err_stat_path *p)
{
	int i;

	i = find_slot(paths->pathvec, p);
	if (i != -1)
		vector_del_slot(paths->pathvec, i);

	destroy_directio_ctx(p);
	free_io_err_stat_path(p);

	return 0;
}

static void account_async_io_state(struct io_err_stat_path *pp, int rc)
{
	switch (rc) {
	case PATH_DOWN:
	case PATH_TIMEOUT:
		pp->io_err_nr++;
		break;
	case PATH_UNCHECKED:
	case PATH_UP:
	case PATH_PENDING:
		break;
	default:
		break;
	}
}

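/*
 * Evaluate a path once its sample window (total_time seconds) has expired:
 * compute the error rate as io_err_nr * 1000 / io_nr and compare it with
 * err_rate_threshold.  Good paths get reinstatement re-enabled (the actual
 * reinstate happens in the checker); bad paths stay failed and enter the
 * polling-recheck state.  The entry is then removed from the check queue.
 */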
static int poll_io_err_stat(struct vectors *vecs, struct io_err_stat_path *pp)
{
	struct timespec currtime, difftime;
	struct path *path;
	double err_rate;

	if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
		return 1;
	timespecsub(&currtime, &pp->start_time, &difftime);
	if (difftime.tv_sec < pp->total_time)
		return 0;

	io_err_stat_log(4, "%s: check end", pp->devname);

	err_rate = pp->io_nr == 0 ? 0 : (pp->io_err_nr * 1000.0f) / pp->io_nr;
	io_err_stat_log(3, "%s: IO error rate (%.1f/1000)",
			pp->devname, err_rate);
	pthread_cleanup_push(cleanup_lock, &vecs->lock);
	lock(&vecs->lock);
	pthread_testcancel();
	path = find_path_by_dev(vecs->pathvec, pp->devname);
	if (!path) {
		io_err_stat_log(4, "path %s not found", pp->devname);
	} else if (err_rate <= pp->err_rate_threshold) {
		path->io_err_pathfail_cnt = 0;
		path->io_err_disable_reinstate = 0;
		io_err_stat_log(3, "%s: (%d/%d) good to enable reinstating",
				pp->devname, pp->io_err_nr, pp->io_nr);
		/*
		 * schedule path check as soon as possible to
		 * update path state. Do NOT reinstate dm path here
		 */
		path->tick = 1;

	} else if (path->mpp && path->mpp->nr_active > 1) {
		io_err_stat_log(3, "%s: keep failing the dm path %s",
				path->mpp->alias, path->dev);
		path->io_err_pathfail_cnt = PATH_IO_ERR_IN_POLLING_RECHECK;
		path->io_err_disable_reinstate = 1;
		path->io_err_dis_reinstate_time = currtime.tv_sec;
		io_err_stat_log(3, "%s: disable reinstating of %s",
				path->mpp->alias, path->dev);
	} else {
		path->io_err_pathfail_cnt = 0;
		path->io_err_disable_reinstate = 0;
		io_err_stat_log(3, "%s: orphan path, enable reinstating",
				pp->devname);
	}
	lock_cleanup_pop(vecs->lock);

	delete_io_err_stat_by_addr(pp);

	return 0;
}

static int send_each_async_io(struct dio_ctx *ct, int fd, char *dev)
{
	int rc = -1;

	if (ct->io_starttime.tv_nsec == 0 &&
			ct->io_starttime.tv_sec == 0) {
		struct iocb *ios[1] = { &ct->io };

		if (clock_gettime(CLOCK_MONOTONIC, &ct->io_starttime) != 0) {
			ct->io_starttime.tv_sec = 0;
			ct->io_starttime.tv_nsec = 0;
			return rc;
		}
		io_prep_pread(&ct->io, fd, ct->buf, ct->blksize, 0);
		if (io_submit(ioctx, 1, ios) != 1) {
			io_err_stat_log(5, "%s: io_submit error %i",
					dev, errno);
			return rc;
		}
		rc = 0;
	}

	return rc;
}

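/*
 * (Re)submit an asynchronous read on every idle dio_ctx slot of the path.
 * No new I/O is started during the last IOTIMEOUT_SEC seconds of the
 * sample window, so that everything already in flight can complete or
 * time out before the path is evaluated.
 */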
static void send_batch_async_ios(struct io_err_stat_path *pp)
{
	int i;
	struct dio_ctx *ct;
	struct timespec currtime, difftime;

	if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
		return;
	/*
	 * Leave time at the end of the window for all I/O to complete
	 * or time out.
	 */
	if (pp->start_time.tv_sec != 0) {
		timespecsub(&currtime, &pp->start_time, &difftime);
		if (difftime.tv_sec + IOTIMEOUT_SEC >= pp->total_time)
			return;
	}

	for (i = 0; i < CONCUR_NR_EVENT; i++) {
		ct = pp->dio_ctx_array + i;
		if (!send_each_async_io(ct, pp->fd, pp->devname))
			pp->io_nr++;
	}
	if (pp->start_time.tv_sec == 0 && pp->start_time.tv_nsec == 0 &&
		clock_gettime(CLOCK_MONOTONIC, &pp->start_time)) {
		pp->start_time.tv_sec = 0;
		pp->start_time.tv_nsec = 0;
	}
}

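/*
 * Cancel a read that has been in flight for more than IOTIMEOUT_SEC
 * seconds and report it as PATH_TIMEOUT; younger in-flight reads are
 * reported as PATH_PENDING, idle slots as PATH_UNCHECKED.
 */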
static int try_to_cancel_timeout_io(struct dio_ctx *ct, struct timespec *t,
		char *dev)
{
	struct timespec difftime;
	struct io_event event;
	int		rc = PATH_UNCHECKED;
	int		r;

	if (ct->io_starttime.tv_sec == 0)
		return rc;
	timespecsub(t, &ct->io_starttime, &difftime);
	if (difftime.tv_sec > IOTIMEOUT_SEC) {
		struct iocb *ios[1] = { &ct->io };

		io_err_stat_log(5, "%s: abort check on timeout", dev);
		r = io_cancel(ioctx, ios[0], &event);
		if (r)
			io_err_stat_log(5, "%s: io_cancel error %i",
					dev, errno);
		ct->io_starttime.tv_sec = 0;
		ct->io_starttime.tv_nsec = 0;
		rc = PATH_TIMEOUT;
	} else {
		rc = PATH_PENDING;
	}

	return rc;
}

static void poll_async_io_timeout(void)
{
	struct io_err_stat_path *pp;
	struct timespec curr_time;
	int		rc = PATH_UNCHECKED;
	int		i, j;

	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
		return;
	vector_foreach_slot(paths->pathvec, pp, i) {
		for (j = 0; j < CONCUR_NR_EVENT; j++) {
			rc = try_to_cancel_timeout_io(pp->dio_ctx_array + j,
					&curr_time, pp->devname);
			account_async_io_state(pp, rc);
		}
	}
}

static void cancel_inflight_io(struct io_err_stat_path *pp)
{
	struct io_event event;
	int i, r;

	for (i = 0; i < CONCUR_NR_EVENT; i++) {
		struct dio_ctx *ct = pp->dio_ctx_array + i;
		struct iocb *ios[1] = { &ct->io };

		if (ct->io_starttime.tv_sec == 0
				&& ct->io_starttime.tv_nsec == 0)
			continue;
		io_err_stat_log(5, "%s: abort inflight io",
				pp->devname);
		r = io_cancel(ioctx, ios[0], &event);
		if (r)
			io_err_stat_log(5, "%s: io_cancel error %d, %i",
					pp->devname, r, errno);
		ct->io_starttime.tv_sec = 0;
		ct->io_starttime.tv_nsec = 0;
	}
}

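/*
 * Match a completed io_event back to its dio_ctx slot.  A read that
 * returned a full block counts as PATH_UP; anything else counts as
 * PATH_DOWN and increments the path's error counter.
 */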
static inline int handle_done_dio_ctx(struct dio_ctx *ct, struct io_event *ev)
{
	ct->io_starttime.tv_sec = 0;
	ct->io_starttime.tv_nsec = 0;
	return (ev->res == ct->blksize) ? PATH_UP : PATH_DOWN;
}

static void handle_async_io_done_event(struct io_event *io_evt)
{
	struct io_err_stat_path *pp;
	struct dio_ctx *ct;
	int rc = PATH_UNCHECKED;
	int i, j;

	vector_foreach_slot(paths->pathvec, pp, i) {
		for (j = 0; j < CONCUR_NR_EVENT; j++) {
			ct = pp->dio_ctx_array + j;
			if (&ct->io == io_evt->obj) {
				rc = handle_done_dio_ctx(ct, io_evt);
				account_async_io_state(pp, rc);
				return;
			}
		}
	}
}

static void process_async_ios_event(int timeout_nsecs, char *dev)
{
	struct io_event events[CONCUR_NR_EVENT];
	int		i, n;
	struct timespec timeout = { .tv_nsec = timeout_nsecs };

	errno = 0;
	n = io_getevents(ioctx, 1L, CONCUR_NR_EVENT, events, &timeout);
	if (n < 0) {
		io_err_stat_log(3, "%s: async io events returned %d (errno=%s)",
				dev, n, strerror(errno));
	} else {
		for (i = 0; i < n; i++)
			handle_async_io_done_event(&events[i]);
	}
}

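/*
 * One pass over all enqueued paths: submit new reads, reap completions
 * (waiting at most TIMEOUT_NO_IO_NSEC for at least one event), cancel
 * timed-out reads, and evaluate any path whose sample window has ended.
 */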
static void service_paths(void)
{
	struct io_err_stat_path *pp;
	int i;

	pthread_mutex_lock(&paths->mutex);
	vector_foreach_slot(paths->pathvec, pp, i) {
		send_batch_async_ios(pp);
		process_async_ios_event(TIMEOUT_NO_IO_NSEC, pp->devname);
		poll_async_io_timeout();
		poll_io_err_stat(vecs, pp);
	}
	pthread_mutex_unlock(&paths->mutex);
}

static void cleanup_unlock(void *arg)
{
	pthread_mutex_unlock((pthread_mutex_t*) arg);
}

static void cleanup_exited(void *arg)
{
	uatomic_set(&io_err_thread_running, 0);
}

static void *io_err_stat_loop(void *data)
{
	sigset_t set;

	vecs = (struct vectors *)data;
	pthread_cleanup_push(rcu_unregister, NULL);
	rcu_register_thread();

	pthread_cleanup_push(cleanup_exited, NULL);

	sigfillset(&set);
	sigdelset(&set, SIGUSR2);

	mlockall(MCL_CURRENT | MCL_FUTURE);

	pthread_mutex_lock(&io_err_thread_lock);
	uatomic_set(&io_err_thread_running, 1);
	pthread_cond_broadcast(&io_err_thread_cond);
	pthread_mutex_unlock(&io_err_thread_lock);

	while (1) {
		struct timespec ts;

		service_paths();

		ts.tv_sec = 0;
		ts.tv_nsec = 100 * 1000 * 1000;
		/*
		 * pselect() with no fds, a timeout, and a sigmask:
		 * sleep for 100ms and react on SIGUSR2.
		 */
		pselect(1, NULL, NULL, NULL, &ts, &set);
	}

	pthread_cleanup_pop(1);
	pthread_cleanup_pop(1);
	return NULL;
}

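/*
 * Set up the aio context and the shared path vector, then start the
 * statistic thread and wait until it has signalled io_err_thread_running
 * before returning.  Returns 0 on success, 1 on failure.
 */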
int start_io_err_stat_thread(void *data)
{
	int ret;

	if (uatomic_read(&io_err_thread_running) == 1)
		return 0;

	if (io_setup(CONCUR_NR_EVENT, &ioctx) != 0) {
		io_err_stat_log(4, "io_setup failed");
		return 1;
	}
	paths = alloc_pathvec();
	if (!paths)
		goto destroy_ctx;

	pthread_mutex_lock(&io_err_thread_lock);
	pthread_cleanup_push(cleanup_unlock, &io_err_thread_lock);

	ret = pthread_create(&io_err_stat_thr, &io_err_stat_attr,
			     io_err_stat_loop, data);

	while (!ret && !uatomic_read(&io_err_thread_running) &&
	       pthread_cond_wait(&io_err_thread_cond,
				 &io_err_thread_lock) == 0);

	pthread_cleanup_pop(1);

	if (ret) {
		io_err_stat_log(0, "cannot create io_error statistic thread");
		goto out_free;
	}

	io_err_stat_log(2, "io_error statistic thread started");
	return 0;

out_free:
	free_io_err_pathvec(paths);
destroy_ctx:
	io_destroy(ioctx);
	io_err_stat_log(0, "failed to start io_error statistic thread");
	return 1;
}

void stop_io_err_stat_thread(void)
{
	if (io_err_stat_thr == (pthread_t)0)
		return;

	if (uatomic_read(&io_err_thread_running) == 1)
		pthread_cancel(io_err_stat_thr);

	pthread_join(io_err_stat_thr, NULL);
	free_io_err_pathvec(paths);
	io_destroy(ioctx);
}