[patchv2] Fix SIGTERM signal safety (PR gdb/15358) [refresh]
Commit Message
Hi,
the post several minutes ago (not the year old one) had accidentally corrupted
testcase.
------------------------------------------------------------------------------
reposting with no changes, the last post is a year old one, it had been already
approved by Tom that time so I will check it in in some time.
[patch] Fix SIGTERM signal safety (PR gdb/15358)
https://sourceware.org/ml/gdb-patches/2013-07/msg00094.html
Message-ID: <20130702200010.GA23478@host2.jankratochvil.net>
The statement about PR remote/15297 dependency is IMO false, this patch works
on its own. The PR remote/15297 patch remains unfinished/pending, but that is
off-topic here.
------------------------------------------------------------------------------
gdb deadlock due to gdb calling calloc() in signal handler
http://sourceware.org/bugzilla/show_bug.cgi?id=15358
Patch depends in its functionality on:
[patchv2 2/2] Fix CTRL-C for remote.c (PR remote/15297)
http://sourceware.org/ml/gdb-patches/2013-06/msg00943.html
Message-ID: <20130630181110.GB29548@host2.jankratochvil.net>
async mode seems easy.
The sync mode is a bit difficult - assuming it is safe to call quit_force from
any place of QUIT;. OTOH the patch above assumes it can do:
if (check_quit_flag ())
send_interrupt_sequence ();
which clears quit_flag but we used set_quit_flag () to call quit_force, not
just to throw the quit exception. This is why QUIT now checks also for
SYNC_QUIT_FORCE_RUN.
The change in linux-nat.c comes from testing i386-linux-nat.c (therefore
32-bit host GDB). i386_linux_resume there calls QUIT; via target_read ().
This is a bug on its own, filed as:
http://sourceware.org/bugzilla/show_bug.cgi?id=15713
But I have seen another bug in linux-nat.c, it was depending on PTRACE_KILL
but at least Linux kernel ptrace expert Oleg Nesterov considers PTRACE_KILL
superseded by kill(SIGKILL). Therefore I used there (also) more safe SIGKILL
so that the possibly inconsistent state of inferior from i386_linux_resume
does not matter.
No regressions on {x86_64,x86_64-m32,i686}-fedora19pre-linux-gnu and in
gdbserver mode.
Thanks,
Jan
gdb/
2013-07-02 Jan Kratochvil <jan.kratochvil@redhat.com>
PR gdb/15358
* defs.h (sync_quit_force_run): New declaration.
(QUIT): Check also SYNC_QUIT_FORCE_RUN.
* event-top.c (async_sigterm_handler): New declaration.
(async_sigterm_token): New variable.
(async_init_signals): Create also async_sigterm_token.
(async_sigterm_handler): New function.
(sync_quit_force_run): New variable.
(handle_sigterm): Replace quit_force call by other calls.
* linux-nat.c (linux_nat_kill): Use kill_callback first.
Extend the comment for stop_callback.
* utils.c (quit): Call quit_force if SYNC_QUIT_FORCE_RUN.
gdb/testsuite/
2013-07-02 Jan Kratochvil <jan.kratochvil@redhat.com>
PR gdb/15358
* gdb.base/gdb-sigterm.c: New file.
* gdb.base/gdb-sigterm.exp: New file.
Comments
Jan Kratochvil writes:
> [...]
Hi.
I have some comments and requests for this patch.
A high level comment is that I guess this patch means it's
now more important to fix the spots of gdb where QUITs aren't
called for long times (I was working on a common one, debug info
reading, but had to defer that to work on something else).
> The sync mode is a bit difficult - assuming it is safe to call quit_force from
> any place of QUIT;. OTOH the patch above assumes it can do:
> if (check_quit_flag ())
> send_interrupt_sequence ();
> which clears quit_flag but we used set_quit_flag () to call quit_force, not
> just to throw the quit exception. This is why QUIT now checks also for
> SYNC_QUIT_FORCE_RUN.
>
> The change in linux-nat.c comes from testing i386-linux-nat.c (therefore
> 32-bit host GDB). i386_linux_resume there calls QUIT; via target_read ().
> This is a bug on its own, filed as:
> http://sourceware.org/bugzilla/show_bug.cgi?id=15713
>
> But I have seen another bug in linux-nat.c, it was depending on PTRACE_KILL
> but at least Linux kernel ptrace expert Oleg Nesterov considers PTRACE_KILL
> superseded by kill(SIGKILL). Therefore I used there (also) more safe SIGKILL
> so that the possibly inconsistent state of inferior from i386_linux_resume
> does not matter.
>
>
> No regressions on {x86_64,x86_64-m32,i686}-fedora19pre-linux-gnu and in
> gdbserver mode.
>
>
> Thanks,
> Jan
>
>
> gdb/
> 2013-07-02 Jan Kratochvil <jan.kratochvil@redhat.com>
>
> PR gdb/15358
> * defs.h (sync_quit_force_run): New declaration.
> (QUIT): Check also SYNC_QUIT_FORCE_RUN.
> * event-top.c (async_sigterm_handler): New declaration.
> (async_sigterm_token): New variable.
> (async_init_signals): Create also async_sigterm_token.
> (async_sigterm_handler): New function.
> (sync_quit_force_run): New variable.
> (handle_sigterm): Replace quit_force call by other calls.
> * linux-nat.c (linux_nat_kill): Use kill_callback first.
> Extend the comment for stop_callback.
> * utils.c (quit): Call quit_force if SYNC_QUIT_FORCE_RUN.
>
> gdb/testsuite/
> 2013-07-02 Jan Kratochvil <jan.kratochvil@redhat.com>
>
> PR gdb/15358
> * gdb.base/gdb-sigterm.c: New file.
> * gdb.base/gdb-sigterm.exp: New file.
>
> diff --git a/gdb/defs.h b/gdb/defs.h
> index 480133d..47da43a 100644
> --- a/gdb/defs.h
> +++ b/gdb/defs.h
> @@ -171,6 +171,9 @@ extern int check_quit_flag (void);
> /* * Set the quit flag. */
> extern void set_quit_flag (void);
>
> +/* Flag that function quit should call quit_force. */
> +extern volatile int sync_quit_force_run;
> +
> extern int immediate_quit;
>
> extern void quit (void);
> @@ -183,7 +186,7 @@ extern void quit (void);
> needed. */
>
> #define QUIT { \
> - if (check_quit_flag ()) quit (); \
> + if (check_quit_flag () || sync_quit_force_run) quit (); \
> if (deprecated_interactive_hook) deprecated_interactive_hook (); \
> }
>
> diff --git a/gdb/event-top.c b/gdb/event-top.c
> index fbe89bd..af1562c 100644
> --- a/gdb/event-top.c
> +++ b/gdb/event-top.c
> @@ -72,6 +72,7 @@ static void async_float_handler (gdb_client_data);
> #ifdef STOP_SIGNAL
> static void async_stop_sig (gdb_client_data);
> #endif
> +static void async_sigterm_handler (gdb_client_data arg);
>
> /* Readline offers an alternate interface, via callback
> functions. These are all included in the file callback.c in the
> @@ -135,6 +136,7 @@ static struct async_signal_handler *sigfpe_token;
> #ifdef STOP_SIGNAL
> static struct async_signal_handler *sigtstp_token;
> #endif
> +static struct async_signal_handler *async_sigterm_token;
>
> /* Structure to save a partially entered command. This is used when
> the user types '\' at the end of a command line. This is necessary
> @@ -770,6 +772,8 @@ async_init_signals (void)
> create_async_signal_handler (async_stop_sig, NULL);
> #endif
>
> + async_sigterm_token =
> + create_async_signal_handler (async_sigterm_handler, NULL);
> }
I realize SIGTERM isn't handled exactly like SIGINT, but IWBN
to keep all the SIGTERM bits together in this function.
How about moving the above snippet to after this:
signal (SIGINT, handle_sigint);
sigint_token =
create_async_signal_handler (async_request_quit, NULL);
signal (SIGTERM, handle_sigterm);
btw, the rule has been effectively changed to require = being
put on the next line.
While I certainly wouldn't ask for whitespace change in this patch,
[IOW I'm not asking for the existing code to be fixed]
I'm curious what the rule is for handling this here.
For the patch at hand, should it be written as this:
async_sigterm_token =
create_async_signal_handler (async_sigterm_handler, NULL);
or this:
async_sigterm_token
= create_async_signal_handler (async_sigterm_handler, NULL);
?
I honestly don't know.
It's not important of course, just wondering.
>
> /* Tell the event loop what to do if SIGINT is received.
> @@ -797,13 +801,31 @@ handle_sigint (int sig)
> gdb_call_async_signal_handler (sigint_token, immediate_quit);
> }
>
> +/* Handle GDB exit upon receiving SIGTERM if target_can_async_p (). */
> +
> +static void
> +async_sigterm_handler (gdb_client_data arg)
> +{
> + quit_force (NULL, stdin == instream);
> +}
> +
> +/* See defs.h. */
> +volatile int sync_quit_force_run;
> +
> /* Quit GDB if SIGTERM is received.
> GDB would quit anyway, but this way it will clean up properly. */
> void
> handle_sigterm (int sig)
> {
> signal (sig, handle_sigterm);
> - quit_force ((char *) 0, stdin == instream);
I think we need a comment here explaining *why* we can't
just call quit_force here.
> +
> + if (target_can_async_p ())
> + mark_async_signal_handler (async_sigterm_token);
> + else
> + {
> + sync_quit_force_run = 1;
> + set_quit_flag ();
> + }
> }
>
> /* Do the quit. All the checks have been done by the caller. */
> diff --git a/gdb/linux-nat.c b/gdb/linux-nat.c
> index b615423..ec84188 100644
> --- a/gdb/linux-nat.c
> +++ b/gdb/linux-nat.c
> @@ -3777,8 +3777,15 @@ linux_nat_kill (struct target_ops *ops)
> {
> ptid_t ptid = pid_to_ptid (ptid_get_pid (inferior_ptid));
>
> + /* Kill all LWP's before trying to stop them. In rare cases the
> + lwp_info state may not match the inferior and
> + stop_wait_callback could lock up. */
> + iterate_over_lwps (ptid, kill_callback, NULL);
> +
> /* Stop all threads before killing them, since ptrace requires
> - that the thread is stopped to sucessfully PTRACE_KILL. */
> + that the thread is stopped to sucessfully PTRACE_KILL.
> + kill_callback normally already turned the inferior into a zombie
> + except for old Linux kernels 2.4.x. */
> iterate_over_lwps (ptid, stop_callback, NULL);
> /* ... and wait until all of them have reported back that
> they're no longer running. */
This part feels sufficiently outside the scope of this patch
that IWBN if this were a separate patch.
> diff --git a/gdb/utils.c b/gdb/utils.c
> index 9d068c5..364470c 100644
> --- a/gdb/utils.c
> +++ b/gdb/utils.c
> @@ -999,6 +999,12 @@ print_sys_errmsg (const char *string, int errcode)
> void
> quit (void)
> {
> + if (sync_quit_force_run)
> + {
> + sync_quit_force_run = 0;
> + quit_force (NULL, stdin == instream);
> + }
> +
Bleah. Not your problem, and obviously I'm not suggesting
fixing the names in this patch, but "quit" is overloaded.
quit is really quit_current_command (or some such),
and quit_force is really quit_and_exit (or some such).
I raise the issue because the patch is conflating them,
and I'm wondering if that's a good idea.
> [...]
>
> diff --git a/gdb/testsuite/gdb.base/gdb-sigterm.exp b/gdb/testsuite/gdb.base/gdb-sigterm.exp
> new file mode 100644
> index 0000000..8baeb96
> --- /dev/null
> +++ b/gdb/testsuite/gdb.base/gdb-sigterm.exp
> @@ -0,0 +1,94 @@
> +# This testcase is part of GDB, the GNU debugger.
> +#
> +# Copyright 2013 Free Software Foundation, Inc.
> +#
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 3 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program. If not, see <http://www.gnu.org/licenses/>.
> +
> +standard_testfile
> +
> +if { [build_executable ${testfile}.exp ${testfile}] == -1 } {
> + return -1
> +}
> +
> +proc do_test { pass } {
> + global testfile gdb_prompt binfile pf_prefix
> +
> + if ![runto_main] {
> + return -1
> + }
> +
> + gdb_breakpoint "${testfile}.c:[gdb_get_line_number "loop-line" ${testfile}.c]" \
> + temporary
> +
> + # gdb_continue_to_breakpoint would print a pass message.
> + gdb_test "continue" "Temporary breakpoint .* loop-line .*" ""
> +
> + gdb_test_no_output "set range-stepping off" ""
> + gdb_test_no_output "set debug infrun 1" ""
> + gdb_test_no_output "set debug lin-lwp 1" ""
> +
> + set test "run a bit #$pass"
> + set abort 1
> + gdb_test_multiple "step" $test {
> + -re "infrun: stepping inside range" {
> + # Suppress pass $test
> + verbose -log "$pf_prefix $test"
> + set abort 0
> + }
> + }
> + if $abort {
> + return
> + }
> +
> + set gdb_pid [exp_pid -i [board_info host fileid]]
> + remote_exec host "kill -TERM ${gdb_pid}"
> +
> + set test "expect eof #$pass"
> + set abort 1
> + set stepping 0
> + gdb_test_multiple "" $test {
> + eof {
> + verbose -log "$pf_prefix $test"
> + set abort 0
> + }
> + -re "infrun: stepping inside range" {
> + incr stepping
> + if { $stepping > 200 } {
> + fail "$test (stepping inside range $stepping times)"
> + } else {
> + exp_continue
> + }
> + }
> + }
> + if $abort {
> + return
> + }
> +}
> +
> +
> +for {set pass 0} {$pass < 50} {incr pass} {
How come 50 passes?
At the least a comment is required explaining why.
> +
> + clean_restart ${testfile}
> + gdb_test_no_output "set target-async off" ""
> + with_test_prefix "sync" {
> + do_test $pass
> + }
> +
> + clean_restart ${testfile}
> + gdb_test_no_output "set target-async on" ""
> + with_test_prefix "async" {
> + do_test $pass
> + }
> +}
> +pass "$pass SIGTERM passes"
@@ -171,6 +171,9 @@ extern int check_quit_flag (void);
/* * Set the quit flag. */
extern void set_quit_flag (void);
+/* Flag that function quit should call quit_force. */
+extern volatile int sync_quit_force_run;
+
extern int immediate_quit;
extern void quit (void);
@@ -183,7 +186,7 @@ extern void quit (void);
needed. */
#define QUIT { \
- if (check_quit_flag ()) quit (); \
+ if (check_quit_flag () || sync_quit_force_run) quit (); \
if (deprecated_interactive_hook) deprecated_interactive_hook (); \
}
@@ -72,6 +72,7 @@ static void async_float_handler (gdb_client_data);
#ifdef STOP_SIGNAL
static void async_stop_sig (gdb_client_data);
#endif
+static void async_sigterm_handler (gdb_client_data arg);
/* Readline offers an alternate interface, via callback
functions. These are all included in the file callback.c in the
@@ -135,6 +136,7 @@ static struct async_signal_handler *sigfpe_token;
#ifdef STOP_SIGNAL
static struct async_signal_handler *sigtstp_token;
#endif
+static struct async_signal_handler *async_sigterm_token;
/* Structure to save a partially entered command. This is used when
the user types '\' at the end of a command line. This is necessary
@@ -770,6 +772,8 @@ async_init_signals (void)
create_async_signal_handler (async_stop_sig, NULL);
#endif
+ async_sigterm_token =
+ create_async_signal_handler (async_sigterm_handler, NULL);
}
/* Tell the event loop what to do if SIGINT is received.
@@ -797,13 +801,31 @@ handle_sigint (int sig)
gdb_call_async_signal_handler (sigint_token, immediate_quit);
}
+/* Handle GDB exit upon receiving SIGTERM if target_can_async_p (). */
+
+static void
+async_sigterm_handler (gdb_client_data arg)
+{
+ quit_force (NULL, stdin == instream);
+}
+
+/* See defs.h. */
+volatile int sync_quit_force_run;
+
/* Quit GDB if SIGTERM is received.
GDB would quit anyway, but this way it will clean up properly. */
void
handle_sigterm (int sig)
{
signal (sig, handle_sigterm);
- quit_force ((char *) 0, stdin == instream);
+
+ if (target_can_async_p ())
+ mark_async_signal_handler (async_sigterm_token);
+ else
+ {
+ sync_quit_force_run = 1;
+ set_quit_flag ();
+ }
}
/* Do the quit. All the checks have been done by the caller. */
@@ -3777,8 +3777,15 @@ linux_nat_kill (struct target_ops *ops)
{
ptid_t ptid = pid_to_ptid (ptid_get_pid (inferior_ptid));
+ /* Kill all LWP's before trying to stop them. In rare cases the
+ lwp_info state may not match the inferior and
+ stop_wait_callback could lock up. */
+ iterate_over_lwps (ptid, kill_callback, NULL);
+
/* Stop all threads before killing them, since ptrace requires
- that the thread is stopped to sucessfully PTRACE_KILL. */
+ that the thread is stopped to sucessfully PTRACE_KILL.
+ kill_callback normally already turned the inferior into a zombie
+ except for old Linux kernels 2.4.x. */
iterate_over_lwps (ptid, stop_callback, NULL);
/* ... and wait until all of them have reported back that
they're no longer running. */
@@ -999,6 +999,12 @@ print_sys_errmsg (const char *string, int errcode)
void
quit (void)
{
+ if (sync_quit_force_run)
+ {
+ sync_quit_force_run = 0;
+ quit_force (NULL, stdin == instream);
+ }
+
#ifdef __MSDOS__
/* No steenking SIGINT will ever be coming our way when the
program is resumed. Don't lie. */
new file mode 100644
@@ -0,0 +1,26 @@
+/* This testcase is part of GDB, the GNU debugger.
+
+ Copyright 2013 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <unistd.h>
+
+int
+main (void)
+{
+ alarm (60);
+
+ for (;;); /* loop-line */
+}
new file mode 100644
@@ -0,0 +1,94 @@
+# This testcase is part of GDB, the GNU debugger.
+#
+# Copyright 2013 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+standard_testfile
+
+if { [build_executable ${testfile}.exp ${testfile}] == -1 } {
+ return -1
+}
+
+proc do_test { pass } {
+ global testfile gdb_prompt binfile pf_prefix
+
+ if ![runto_main] {
+ return -1
+ }
+
+ gdb_breakpoint "${testfile}.c:[gdb_get_line_number "loop-line" ${testfile}.c]" \
+ temporary
+
+ # gdb_continue_to_breakpoint would print a pass message.
+ gdb_test "continue" "Temporary breakpoint .* loop-line .*" ""
+
+ gdb_test_no_output "set range-stepping off" ""
+ gdb_test_no_output "set debug infrun 1" ""
+ gdb_test_no_output "set debug lin-lwp 1" ""
+
+ set test "run a bit #$pass"
+ set abort 1
+ gdb_test_multiple "step" $test {
+ -re "infrun: stepping inside range" {
+ # Suppress pass $test
+ verbose -log "$pf_prefix $test"
+ set abort 0
+ }
+ }
+ if $abort {
+ return
+ }
+
+ set gdb_pid [exp_pid -i [board_info host fileid]]
+ remote_exec host "kill -TERM ${gdb_pid}"
+
+ set test "expect eof #$pass"
+ set abort 1
+ set stepping 0
+ gdb_test_multiple "" $test {
+ eof {
+ verbose -log "$pf_prefix $test"
+ set abort 0
+ }
+ -re "infrun: stepping inside range" {
+ incr stepping
+ if { $stepping > 200 } {
+ fail "$test (stepping inside range $stepping times)"
+ } else {
+ exp_continue
+ }
+ }
+ }
+ if $abort {
+ return
+ }
+}
+
+
+for {set pass 0} {$pass < 50} {incr pass} {
+
+ clean_restart ${testfile}
+ gdb_test_no_output "set target-async off" ""
+ with_test_prefix "sync" {
+ do_test $pass
+ }
+
+ clean_restart ${testfile}
+ gdb_test_no_output "set target-async on" ""
+ with_test_prefix "async" {
+ do_test $pass
+ }
+}
+pass "$pass SIGTERM passes"