/*
 * (C) Copyright HUAWEI Technology Corp. 2017, All Rights Reserved.
 *
 * IO error stream statistic process for path failure events from the kernel
 *
 * Author(s): Guan Junxiong 2017 <guanjunxiong@huawei.com>
 *
 * This file is released under the GPL version 2, or any later version.
 */
#include <pthread.h>
#include <signal.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <libaio.h>
#include <errno.h>
#include <sys/select.h>

#include "checkers.h"
#include "structs_vec.h"
#include "devmapper.h"
#include "debug.h"
#include "lock.h"
#include "time-util.h"
#include "io_err_stat.h"
#define IOTIMEOUT_SEC			60
#define TIMEOUT_NO_IO_NSEC		10000000 /* 10ms = 10000000ns */
#define FLAKY_PATHFAIL_THRESHOLD	2
#define CONCUR_NR_EVENT			32

#define PATH_IO_ERR_IN_CHECKING		-1
#define PATH_IO_ERR_IN_POLLING_RECHECK	-2

#define io_err_stat_log(prio, fmt, args...) \
	condlog(prio, "io error statistic: " fmt, ##args)
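
/*
 * Overview of the checking scheme implemented below:
 *
 * 1. On a path failure event, io_err_stat_handle_pathfail() pre-checks
 *    flakiness: a path failing FLAKY_PATHFAIL_THRESHOLD times within
 *    marginal_path_double_failed_time seconds is enqueued for checking.
 * 2. enqueue_io_err_stat_by_path() fails the dm path and adds it to the
 *    shared "paths" vector.
 * 3. The io_err_stat thread issues batches of CONCUR_NR_EVENT asynchronous
 *    direct reads (libaio) against the path for
 *    marginal_path_err_sample_time seconds.
 * 4. poll_io_err_stat() computes the error rate per thousand I/Os and either
 *    re-enables reinstating (rate <= marginal_path_err_rate_threshold) or
 *    keeps the path failed and rechecks it after
 *    marginal_path_err_recheck_gap_time seconds.
 */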
struct io_err_stat_pathvec {
	pthread_mutex_t		mutex;
	vector			pathvec;
};

struct dio_ctx {
	struct timespec		io_starttime;
	int			blksize;
	void			*buf;
	struct iocb		io;
};

struct io_err_stat_path {
	char			devname[FILE_NAME_SIZE];
	int			fd;
	struct dio_ctx		*dio_ctx_array;
	int			io_err_nr;
	int			io_nr;
	struct timespec		start_time;
	int			total_time;
	int			err_rate_threshold;
};

pthread_t	io_err_stat_thr;
pthread_attr_t	io_err_stat_attr;

static struct io_err_stat_pathvec *paths;
static struct vectors *vecs;
static io_context_t ioctx;
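
/*
 * "paths" is shared between the context that enqueues paths (multipathd's
 * checker/uevent handling) and the io_err_stat thread that services them;
 * every access to paths->pathvec below is serialized by paths->mutex.
 */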
static void cancel_inflight_io(struct io_err_stat_path *pp);

static void rcu_unregister(void *param)
{
	rcu_unregister_thread();
}
struct io_err_stat_path *find_err_path_by_dev(vector pathvec, char *dev)
{
	int i;
	struct io_err_stat_path *pp;

	vector_foreach_slot(pathvec, pp, i)
		if (!strcmp(pp->devname, dev))
			return pp;

	io_err_stat_log(4, "%s: not found in check queue", dev);
	return NULL;
}
static int init_each_dio_ctx(struct dio_ctx *ct, int blksize,
			     unsigned long pgsize)
{
	ct->blksize = blksize;
	if (posix_memalign(&ct->buf, pgsize, blksize))
		return 1;
	memset(ct->buf, 0, blksize);
	ct->io_starttime.tv_sec = 0;
	ct->io_starttime.tv_nsec = 0;
	return 0;
}

static void deinit_each_dio_ctx(struct dio_ctx *ct)
{
	if (ct->buf)
		FREE(ct->buf);
}
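
/*
 * Open /dev/<devname> with O_DIRECT and prepare CONCUR_NR_EVENT dio
 * contexts, each sized to the device block size reported by BLKBSZGET
 * (falling back to 512 bytes if the ioctl fails).
 */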
static int setup_directio_ctx(struct io_err_stat_path *p)
{
	unsigned long pgsize = getpagesize();
	char fpath[PATH_MAX];
	int blksize = 0;
	int i;

	if (snprintf(fpath, PATH_MAX, "/dev/%s", p->devname) >= PATH_MAX)
		return 1;
	p->fd = open(fpath, O_RDONLY | O_DIRECT);
	if (p->fd < 0)
		return 1;
	p->dio_ctx_array = MALLOC(sizeof(struct dio_ctx) * CONCUR_NR_EVENT);
	if (!p->dio_ctx_array)
		goto fail_close;
	if (ioctl(p->fd, BLKBSZGET, &blksize) < 0) {
		io_err_stat_log(4, "%s: cannot get blocksize, set default 512",
				p->devname);
		blksize = 512;
	}
	for (i = 0; i < CONCUR_NR_EVENT; i++)
		if (init_each_dio_ctx(p->dio_ctx_array + i, blksize, pgsize))
			goto deinit;
	return 0;

deinit:
	for (i = 0; i < CONCUR_NR_EVENT; i++)
		deinit_each_dio_ctx(p->dio_ctx_array + i);
	FREE(p->dio_ctx_array);
fail_close:
	close(p->fd);
	return 1;
}
static void destroy_directio_ctx(struct io_err_stat_path *p)
{
	int i;

	if (!p || !p->dio_ctx_array)
		return;
	cancel_inflight_io(p);
	for (i = 0; i < CONCUR_NR_EVENT; i++)
		deinit_each_dio_ctx(p->dio_ctx_array + i);
	FREE(p->dio_ctx_array);
	if (p->fd > 0)
		close(p->fd);
}
static struct io_err_stat_path *alloc_io_err_stat_path(void)
{
	struct io_err_stat_path *p;

	p = (struct io_err_stat_path *)MALLOC(sizeof(*p));
	if (!p)
		return NULL;
	memset(p->devname, 0, sizeof(p->devname));
	p->start_time.tv_sec = 0;
	p->start_time.tv_nsec = 0;
	p->err_rate_threshold = 0;
	return p;
}

static void free_io_err_stat_path(struct io_err_stat_path *p)
{
	FREE(p);
}
static struct io_err_stat_pathvec *alloc_pathvec(void)
{
	struct io_err_stat_pathvec *p;
	int r;

	p = (struct io_err_stat_pathvec *)MALLOC(sizeof(*p));
	if (!p)
		return NULL;
	p->pathvec = vector_alloc();
	if (!p->pathvec)
		goto out_free_struct_pathvec;
	r = pthread_mutex_init(&p->mutex, NULL);
	if (r)
		goto out_free_member_pathvec;
	return p;

out_free_member_pathvec:
	vector_free(p->pathvec);
out_free_struct_pathvec:
	FREE(p);
	return NULL;
}

static void free_io_err_pathvec(struct io_err_stat_pathvec *p)
{
	struct io_err_stat_path *path;
	int i;

	if (!p)
		return;
	pthread_mutex_destroy(&p->mutex);
	vector_foreach_slot(p->pathvec, path, i) {
		destroy_directio_ctx(path);
		free_io_err_stat_path(path);
	}
	vector_free(p->pathvec);
	FREE(p);
}
/*
 * Return value:
 * 0: enqueued successfully
 * 1: failed because of an internal error
 * 2: failed because the path is already being checked
 */
static int enqueue_io_err_stat_by_path(struct path *path)
{
	struct io_err_stat_path *p;

	pthread_mutex_lock(&paths->mutex);
	p = find_err_path_by_dev(paths->pathvec, path->dev);
	if (p) {
		pthread_mutex_unlock(&paths->mutex);
		return 2;
	}
	pthread_mutex_unlock(&paths->mutex);

	p = alloc_io_err_stat_path();
	if (!p)
		return 1;

	memcpy(p->devname, path->dev, sizeof(p->devname));
	p->total_time = path->mpp->marginal_path_err_sample_time;
	p->err_rate_threshold = path->mpp->marginal_path_err_rate_threshold;

	if (setup_directio_ctx(p))
		goto free_ioerr_path;
	pthread_mutex_lock(&paths->mutex);
	if (!vector_alloc_slot(paths->pathvec))
		goto unlock_pathvec;
	vector_set_slot(paths->pathvec, p);
	pthread_mutex_unlock(&paths->mutex);
	if (!path->io_err_disable_reinstate) {
		/*
		 * Fail the path in the kernel for the duration of the test
		 * to make the result more reliable.
		 */
		io_err_stat_log(3, "%s: fail dm path %s before checking",
				path->mpp->alias, path->dev);
		path->io_err_disable_reinstate = 1;
		dm_fail_path(path->mpp->alias, path->dev_t);
		update_queue_mode_del_path(path->mpp);

		/*
		 * Schedule a path check as soon as possible to update
		 * the path state to the delayed state.
		 */
		path->tick = 1;
	}
	io_err_stat_log(2, "%s: enqueue path %s to check",
			path->mpp->alias, path->dev);
	return 0;
unlock_pathvec:
	pthread_mutex_unlock(&paths->mutex);
	destroy_directio_ctx(p);
free_ioerr_path:
	free_io_err_stat_path(p);
	return 1;
}
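
/*
 * The pathfail handling below is only armed when all four marginal_path_*
 * parameters are configured. For illustration only, a multipath.conf
 * snippet enabling it might look like (values are examples, not
 * recommendations):
 *
 *	defaults {
 *		marginal_path_double_failed_time	5
 *		marginal_path_err_sample_time		120
 *		marginal_path_err_recheck_gap_time	30
 *		marginal_path_err_rate_threshold	10
 *	}
 *
 * Times are in seconds; the threshold is in errors per thousand I/Os; the
 * sample time must be at least 2 * IOTIMEOUT_SEC.
 */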
int io_err_stat_handle_pathfail(struct path *path)
{
	struct timespec curr_time;
	int res;

	if (path->io_err_disable_reinstate) {
		io_err_stat_log(3, "%s: reinstate is already disabled",
				path->dev);
		return 1;
	}
	if (path->io_err_pathfail_cnt < 0)
		return 1;

	if (path->mpp->nr_active <= 1)
		return 1;
	if (path->mpp->marginal_path_double_failed_time <= 0 ||
	    path->mpp->marginal_path_err_sample_time <= 0 ||
	    path->mpp->marginal_path_err_recheck_gap_time <= 0 ||
	    path->mpp->marginal_path_err_rate_threshold < 0) {
		io_err_stat_log(4, "%s: parameter not set", path->mpp->alias);
		return 1;
	}
	if (path->mpp->marginal_path_err_sample_time < (2 * IOTIMEOUT_SEC)) {
		io_err_stat_log(2, "%s: marginal_path_err_sample_time should not be less than %d",
				path->mpp->alias, 2 * IOTIMEOUT_SEC);
		return 1;
	}
	/*
	 * The test should only be started for paths that have failed
	 * repeatedly in a certain time frame, so that we have reason
	 * to assume they're flaky. Without bothering the admin to configure
	 * the repeated count threshold and time frame, we assume a path
	 * which fails at least twice within 60 seconds is flaky.
	 */
	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
		return 1;
	if (path->io_err_pathfail_cnt == 0) {
		path->io_err_pathfail_cnt++;
		path->io_err_pathfail_starttime = curr_time.tv_sec;
		io_err_stat_log(5, "%s: start path flakiness pre-checking",
				path->dev);
		return 0;
	}
	if ((curr_time.tv_sec - path->io_err_pathfail_starttime) >
			path->mpp->marginal_path_double_failed_time) {
		path->io_err_pathfail_cnt = 0;
		path->io_err_pathfail_starttime = curr_time.tv_sec;
		io_err_stat_log(5, "%s: restart path flakiness pre-checking",
				path->dev);
	}
	path->io_err_pathfail_cnt++;
	if (path->io_err_pathfail_cnt >= FLAKY_PATHFAIL_THRESHOLD) {
		res = enqueue_io_err_stat_by_path(path);
		if (!res)
			path->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING;
		else
			path->io_err_pathfail_cnt = 0;
	}

	return 0;
}
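
/*
 * Decide, for a path whose reinstating has been disabled, whether it should
 * stay in the marginal (PATH_SHAKY) state: return 1 to keep it there, 0
 * once it may be recovered in the check_path routine. When the recheck gap
 * has elapsed, the path is re-enqueued for another round of checking.
 */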
int hit_io_err_recheck_time(struct path *pp)
{
	struct timespec curr_time;
	int r;

	if (pp->mpp->nr_active <= 0) {
		io_err_stat_log(2, "%s: recover path early", pp->dev);
		goto recover;
	}
	if (pp->io_err_pathfail_cnt != PATH_IO_ERR_IN_POLLING_RECHECK)
		return 1;
	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0 ||
	    (curr_time.tv_sec - pp->io_err_dis_reinstate_time) >
			pp->mpp->marginal_path_err_recheck_gap_time) {
		io_err_stat_log(4, "%s: reschedule checking after %d seconds",
				pp->dev,
				pp->mpp->marginal_path_err_recheck_gap_time);
		/*
		 * Reschedule io error checking. If the path turns out to be
		 * good enough, we claim it is good and it can be reinstated
		 * as soon as possible in the check_path routine.
		 */
		pp->io_err_dis_reinstate_time = curr_time.tv_sec;
		r = enqueue_io_err_stat_by_path(pp);
		/*
		 * If enqueueing fails because of an internal error, recover
		 * this path. Otherwise return 1 to keep the path state at
		 * PATH_SHAKY.
		 */
		if (r == 1) {
			io_err_stat_log(3, "%s: enqueue fails, to recover",
					pp->dev);
			goto recover;
		} else
			pp->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING;
	}

	return 1;

recover:
	pp->io_err_pathfail_cnt = 0;
	pp->io_err_disable_reinstate = 0;
	pp->tick = 1;
	return 0;
}
static int delete_io_err_stat_by_addr(struct io_err_stat_path *p)
{
	int i;

	i = find_slot(paths->pathvec, p);
	if (i != -1)
		vector_del_slot(paths->pathvec, i);
	destroy_directio_ctx(p);
	free_io_err_stat_path(p);
	return 0;
}
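
/*
 * Tally one async read result for this path; failed and timed-out reads
 * are presumably counted into pp->io_err_nr (the submit path counts
 * pp->io_nr), feeding the error rate computed in poll_io_err_stat().
 */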
static void account_async_io_state(struct io_err_stat_path *pp, int rc)

static int poll_io_err_stat(struct vectors *vecs, struct io_err_stat_path *pp)
{
	struct timespec currtime, difftime;
	struct path *path;
	double err_rate;

	if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
		return 1;
	timespecsub(&currtime, &pp->start_time, &difftime);
	if (difftime.tv_sec < pp->total_time)
		return 0;

	io_err_stat_log(4, "%s: check end", pp->devname);

	err_rate = pp->io_nr == 0 ? 0 : (pp->io_err_nr * 1000.0f) / pp->io_nr;
	io_err_stat_log(3, "%s: IO error rate (%.1f/1000)",
			pp->devname, err_rate);
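	/*
	 * err_rate is expressed per thousand I/Os: for example, 5 failed
	 * reads out of 2000 submitted gives 2.5, which is compared against
	 * marginal_path_err_rate_threshold below.
	 */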
	pthread_cleanup_push(cleanup_lock, &vecs->lock);
	lock(&vecs->lock);
	pthread_testcancel();
	path = find_path_by_dev(vecs->pathvec, pp->devname);
	if (!path) {
		io_err_stat_log(4, "path %s not found", pp->devname);
	} else if (err_rate <= pp->err_rate_threshold) {
		path->io_err_pathfail_cnt = 0;
		path->io_err_disable_reinstate = 0;
		io_err_stat_log(3, "%s: (%d/%d) good to enable reinstating",
				pp->devname, pp->io_err_nr, pp->io_nr);
		/*
		 * Schedule a path check as soon as possible to update the
		 * path state. Do NOT reinstate the dm path here.
		 */
		path->tick = 1;
	} else if (path->mpp && path->mpp->nr_active > 1) {
		io_err_stat_log(3, "%s: keep failing the dm path %s",
				path->mpp->alias, path->dev);
		path->io_err_pathfail_cnt = PATH_IO_ERR_IN_POLLING_RECHECK;
		path->io_err_disable_reinstate = 1;
		path->io_err_dis_reinstate_time = currtime.tv_sec;
		io_err_stat_log(3, "%s: disable reinstating of %s",
				path->mpp->alias, path->dev);
	} else {
		path->io_err_pathfail_cnt = 0;
		path->io_err_disable_reinstate = 0;
		io_err_stat_log(3, "%s: orphan path, enable reinstating",
				pp->devname);
	}
	lock_cleanup_pop(vecs->lock);

	delete_io_err_stat_by_addr(pp);

	return 0;
}
static int send_each_async_io(struct dio_ctx *ct, int fd, char *dev)
{
	int rc = -1;

	if (ct->io_starttime.tv_nsec == 0 &&
	    ct->io_starttime.tv_sec == 0) {
		struct iocb *ios[1] = { &ct->io };

		if (clock_gettime(CLOCK_MONOTONIC, &ct->io_starttime) != 0) {
			ct->io_starttime.tv_sec = 0;
			ct->io_starttime.tv_nsec = 0;
			return rc;
		}
		io_prep_pread(&ct->io, fd, ct->buf, ct->blksize, 0);
		if (io_submit(ioctx, 1, ios) != 1) {
			io_err_stat_log(5, "%s: io_submit error %i",
					dev, errno);
			return rc;
		}
		rc = 0;
	}
	return rc;
}
static void send_batch_async_ios(struct io_err_stat_path *pp)
{
	int i;
	struct dio_ctx *ct;
	struct timespec currtime, difftime;

	if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
		return;
	/*
	 * Leave enough of the sampling window for all IO to complete or
	 * time out.
	 */
	if (pp->start_time.tv_sec != 0) {
		timespecsub(&currtime, &pp->start_time, &difftime);
		if (difftime.tv_sec + IOTIMEOUT_SEC >= pp->total_time)
			return;
	}
	for (i = 0; i < CONCUR_NR_EVENT; i++) {
		ct = pp->dio_ctx_array + i;
		if (!send_each_async_io(ct, pp->fd, pp->devname))
			pp->io_nr++;
	}
	if (pp->start_time.tv_sec == 0 && pp->start_time.tv_nsec == 0 &&
	    clock_gettime(CLOCK_MONOTONIC, &pp->start_time)) {
		pp->start_time.tv_sec = 0;
		pp->start_time.tv_nsec = 0;
	}
}
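
/*
 * Abort a single outstanding read that has been in flight for longer than
 * IOTIMEOUT_SEC and report the timeout to the caller for accounting.
 */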
static int try_to_cancel_timeout_io(struct dio_ctx *ct, struct timespec *t,
				    char *dev)
{
	struct timespec difftime;
	struct io_event event;
	int rc = PATH_UNCHECKED;
	int r;

	if (ct->io_starttime.tv_sec == 0)
		return rc;
	timespecsub(t, &ct->io_starttime, &difftime);
	if (difftime.tv_sec > IOTIMEOUT_SEC) {
		struct iocb *ios[1] = { &ct->io };

		io_err_stat_log(5, "%s: abort check on timeout", dev);
		r = io_cancel(ioctx, ios[0], &event);
		if (r)
			io_err_stat_log(5, "%s: io_cancel error %i",
					dev, errno);
		ct->io_starttime.tv_sec = 0;
		ct->io_starttime.tv_nsec = 0;
		rc = PATH_TIMEOUT;
	} else
		rc = PATH_PENDING;

	return rc;
}
static void poll_async_io_timeout(void)
{
	struct io_err_stat_path *pp;
	struct timespec curr_time;
	int rc = PATH_UNCHECKED;
	int i, j;

	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
		return;
	vector_foreach_slot(paths->pathvec, pp, i) {
		for (j = 0; j < CONCUR_NR_EVENT; j++) {
			rc = try_to_cancel_timeout_io(pp->dio_ctx_array + j,
					&curr_time, pp->devname);
			account_async_io_state(pp, rc);
		}
	}
}
static void cancel_inflight_io(struct io_err_stat_path *pp)
{
	struct io_event event;
	int i, r;

	for (i = 0; i < CONCUR_NR_EVENT; i++) {
		struct dio_ctx *ct = pp->dio_ctx_array + i;
		struct iocb *ios[1] = { &ct->io };

		if (ct->io_starttime.tv_sec == 0
		    && ct->io_starttime.tv_nsec == 0)
			continue;
		io_err_stat_log(5, "%s: abort inflight io",
				pp->devname);
		r = io_cancel(ioctx, ios[0], &event);
		if (r)
			io_err_stat_log(5, "%s: io_cancel error %d, %i",
					pp->devname, r, errno);
		ct->io_starttime.tv_sec = 0;
		ct->io_starttime.tv_nsec = 0;
	}
}
static inline int handle_done_dio_ctx(struct dio_ctx *ct, struct io_event *ev)
{
	ct->io_starttime.tv_sec = 0;
	ct->io_starttime.tv_nsec = 0;
	return (ev->res == ct->blksize) ? PATH_UP : PATH_DOWN;
}
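
/*
 * Note on the check above: io_event.res holds the number of bytes
 * transferred (or a negative error code), so anything short of a full
 * blksize read counts as PATH_DOWN.
 */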
static void handle_async_io_done_event(struct io_event *io_evt)
{
	struct io_err_stat_path *pp;
	struct dio_ctx *ct;
	int rc = PATH_UNCHECKED;
	int i, j;

	vector_foreach_slot(paths->pathvec, pp, i) {
		for (j = 0; j < CONCUR_NR_EVENT; j++) {
			ct = pp->dio_ctx_array + j;
			if (&ct->io == io_evt->obj) {
				rc = handle_done_dio_ctx(ct, io_evt);
				account_async_io_state(pp, rc);
				return;
			}
		}
	}
}
static void process_async_ios_event(int timeout_nsecs, char *dev)
{
	struct io_event events[CONCUR_NR_EVENT];
	int i, n;
	struct timespec timeout = { .tv_nsec = timeout_nsecs };

	n = io_getevents(ioctx, 1L, CONCUR_NR_EVENT, events, &timeout);
	if (n < 0) {
		io_err_stat_log(3, "%s: async io events returned %d (errno=%s)",
				dev, n, strerror(errno));
	} else {
		for (i = 0; i < n; i++)
			handle_async_io_done_event(&events[i]);
	}
}
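
/*
 * One service pass over every enqueued path: submit a batch of async reads,
 * reap completions for up to TIMEOUT_NO_IO_NSEC, cancel reads that exceeded
 * IOTIMEOUT_SEC, and check whether the sampling interval has elapsed.
 */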
static void service_paths(void)
{
	struct io_err_stat_path *pp;
	int i;

	pthread_mutex_lock(&paths->mutex);
	vector_foreach_slot(paths->pathvec, pp, i) {
		send_batch_async_ios(pp);
		process_async_ios_event(TIMEOUT_NO_IO_NSEC, pp->devname);
		poll_async_io_timeout();
		poll_io_err_stat(vecs, pp);
	}
	pthread_mutex_unlock(&paths->mutex);
}
static void *io_err_stat_loop(void *data)
{
	sigset_t set;

	vecs = (struct vectors *)data;
	pthread_cleanup_push(rcu_unregister, NULL);
	rcu_register_thread();

	sigfillset(&set);
	sigdelset(&set, SIGUSR2);
	mlockall(MCL_CURRENT | MCL_FUTURE);
	while (1) {
		struct timespec ts = { .tv_sec = 0 };

		service_paths();
		ts.tv_nsec = 100 * 1000 * 1000;
		/*
		 * pselect() with no fds, a timeout, and a sigmask:
		 * sleep for 100ms and react on SIGUSR2.
		 */
		pselect(1, NULL, NULL, NULL, &ts, &set);
	}

	pthread_cleanup_pop(1);
	return NULL;
}
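
/*
 * Set up the libaio context and the shared path vector, then start
 * io_err_stat_loop(); on any failure everything is torn down again and an
 * error is returned to the caller.
 */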
int start_io_err_stat_thread(void *data)
{
	if (io_setup(CONCUR_NR_EVENT, &ioctx) != 0) {
		io_err_stat_log(4, "io_setup failed");
		return 1;
	}
	paths = alloc_pathvec();
	if (!paths)
		goto destroy_ctx;

	if (pthread_create(&io_err_stat_thr, &io_err_stat_attr,
			   io_err_stat_loop, data)) {
		io_err_stat_log(0, "cannot create io_error statistic thread");
		goto out_free;
	}
	io_err_stat_log(3, "thread started");
	return 0;

out_free:
	free_io_err_pathvec(paths);
destroy_ctx:
	io_destroy(ioctx);
	io_err_stat_log(0, "failed to start io_error statistic thread");
	return 1;
}
void stop_io_err_stat_thread(void)
{
	pthread_cancel(io_err_stat_thr);
	pthread_kill(io_err_stat_thr, SIGUSR2);
	pthread_join(io_err_stat_thr, NULL);
	free_io_err_pathvec(paths);
	io_destroy(ioctx);
}