This commit teaches GDB's gcore command to generate sparse core files
(if supported by the filesystem).
To create a sparse file, all you have to do is skip writing zeros to
the file, instead lseek'ing-ahead over them.
The sparse logic is applied when writing the memory sections, as
that's where the bulk of the data and the zeros are.
The commit also tweaks gdb.base/bigcore.exp to make it exercise
gdb-generated cores in addition to kernel-generated cores. We
couldn't do that before, because GDB's gcore on that test's program
would generate a multi-GB non-sparse core (16GB on my system).
After this commit, gdb.base/bigcore.exp generates, when testing with
GDB's gcore, a much smaller core file, roughly in line with what the
kernel produces:
real sizes:
$ du --hu testsuite/outputs/gdb.base/bigcore/bigcore.corefile.*
2.2M testsuite/outputs/gdb.base/bigcore/bigcore.corefile.gdb
2.0M testsuite/outputs/gdb.base/bigcore/bigcore.corefile.kernel
apparent sizes:
$ du --hu --apparent-size testsuite/outputs/gdb.base/bigcore/bigcore.corefile.*
16G testsuite/outputs/gdb.base/bigcore/bigcore.corefile.gdb
16G testsuite/outputs/gdb.base/bigcore/bigcore.corefile.kernel
Time to generate the core also goes down significantly. On my machine, I get:
when writing to an SSD, from 21.0s, down to 8.0s
when writing to an HDD, from 31.0s, down to 8.5s
The changes to gdb.base/bigcore.exp are smaller than they look at
first sight. It's basically mostly refactoring -- moving most of the
code to a new procedure which takes as argument who should dump the
core, and then calling the procedure twice. I purposely did not
modernize any of the refactored code in this patch.
Bug: https://sourceware.org/bugzilla/show_bug.cgi?id=31494
Reviewed-By: Lancelot Six <lancelot.six@amd.com>
Change-Id: I2554a6a4a72d8c199ce31f176e0ead0c0c76cff1
---
gdb/NEWS | 4 +
gdb/doc/gdb.texinfo | 3 +
gdb/gcore.c | 115 +++++++++++++-
gdb/testsuite/gdb.base/bigcore.exp | 238 ++++++++++++++++-------------
4 files changed, 252 insertions(+), 108 deletions(-)
base-commit: d01264b71654dae83a254cf030a5cf73b66b7fbb
@@ -23,6 +23,10 @@ disassemble
command will now give an error. Previously the 'b' flag would
always override the 'r' flag.
+gcore
+generate-core-file
+ GDB now generates sparse core files, on systems that support it.
+
maintenance info line-table
Add an EPILOGUE-BEGIN column to the output of the command. It indicates
if the line is considered the start of the epilgoue, and thus a point at
@@ -13867,6 +13867,9 @@ Produce a core dump of the inferior process. The optional argument
specified, the file name defaults to @file{core.@var{pid}}, where
@var{pid} is the inferior process ID.
+If supported by the filesystem where the core is written to,
+@value{GDBN} generates a sparse core dump file.
+
Note that this command is implemented only for some systems (as of
this writing, @sc{gnu}/Linux, FreeBSD, Solaris, and S390).
@@ -39,10 +39,18 @@
#include "gdbsupport/byte-vector.h"
#include "gdbsupport/scope-exit.h"
+/* To generate sparse cores, we look at the data to write in chunks of
+ this size when considering whether to skip the write. Only if we
+ have a full block of this size with all zeros do we skip writing
+ it. A simpler algorithm that would try to skip all zeros would
+ result in potentially many more write/lseek syscalls, as normal
+ data is typically sprinkled with many small holes of zeros. */
+#define SPARSE_BLOCK_SIZE 0x1000
+
/* The largest amount of memory to read from the target at once. We
must throttle it to limit the amount of memory used by GDB during
generate-core-file for programs with large resident data. */
-#define MAX_COPY_BYTES (1024 * 1024)
+#define MAX_COPY_BYTES (256 * SPARSE_BLOCK_SIZE)
static const char *default_gcore_target (void);
static enum bfd_architecture default_gcore_arch (void);
@@ -98,7 +106,12 @@ write_gcore_file_1 (bfd *obfd)
bfd_set_section_alignment (note_sec, 0);
bfd_set_section_size (note_sec, note_size);
- /* Now create the memory/load sections. */
+ /* Now create the memory/load sections. Note
+ gcore_memory_sections's sparse logic is assuming that we'll
+ always write something afterwards, which we do: just below, we
+ write the note section. So there's no need for an ftruncate-like
+ call to grow the file to the right size if the last memory
+ sections were zeros and we skipped writing them. */
if (gcore_memory_sections (obfd) == 0)
error (_("gcore: failed to get corefile memory sections from target."));
@@ -567,6 +580,99 @@ objfile_find_memory_regions (struct target_ops *self,
return 0;
}
+/* Return true if we have a block full of zeros at DATA within the
+ [DATA, DATA+SIZE) buffer, false otherwise. */
+
+static bool
+is_all_zero_block (const gdb_byte *data, size_t size)
+{
+ if (size < SPARSE_BLOCK_SIZE)
+ return false;
+
+ /* A memcmp of a whole block is much faster than a simple for loop.
+ This makes a big difference, as with a for loop, this code would
+ dominate the performance and result in doubling the time to
+ generate a core, at the time of writing. With an optimized
+ memcmp, this doesn't even show up in the perf trace. */
+ static const gdb_byte all_zero_block[SPARSE_BLOCK_SIZE] = {};
+ return memcmp (data, all_zero_block, SPARSE_BLOCK_SIZE) == 0;
+}
+
+/* Find the next all-zero block at DATA+OFFSET within the [DATA,
+ DATA+SIZE) buffer. Returns the offset to the all-zero block if
+ found, or zero if not found. */
+
+static size_t
+next_all_zero_block (const gdb_byte *data, size_t offset, size_t size)
+{
+ for (; offset < size; offset += SPARSE_BLOCK_SIZE)
+ if (is_all_zero_block (data + offset, size - offset))
+ return offset;
+ return 0;
+}
+
+/* Wrapper around bfd_set_section_contents that avoids writing
+ all-zero blocks to disk, so we create a sparse core file. */
+
+static bool
+sparse_bfd_set_section_contents (bfd *obfd, asection *osec,
+ const gdb_byte *data,
+ size_t sec_offset,
+ size_t size)
+{
+ /* Note, we don't have to have special handling for the case of the
+ last memory region ending with zeros, because our caller always
+ writes out the note section after the memory/load sections. If
+ it didn't, we'd have to seek+write the last byte to make the file
+ size correct. (Or add an ftruncate abstraction to bfd and call
+ that.)
+
+ If the blocks we skip are not aligned with the filesystem blocks,
+ on filesystems with fixed block sizes, we may waste a tiny bit
+ of file size, as the blocks that are adjacent to all-zero blocks
+ will have to include an amount of zeros. However, in practice,
+ sections have some alignment, and so SEC_OFFSET will be aligned
+ too, as our caller reads-in memory in chunks of SPARSE_BLOCK_SIZE
+ multiples. It's just not worth the bother to worry about
+ alignment here. */
+ size_t data_offset = 0;
+ while (data_offset < size)
+ {
+ if (is_all_zero_block (data + data_offset, size - data_offset))
+ data_offset += SPARSE_BLOCK_SIZE;
+ else
+ {
+ /* We have some non-zero data to write to file. Find the
+ next all-zero block within the data, and only write up to
+ it. */
+ size_t next_all_zero_block_offset
+ = next_all_zero_block (data, data_offset + SPARSE_BLOCK_SIZE, size);
+ size_t next_data_offset = (next_all_zero_block_offset == 0
+ ? size
+ : next_all_zero_block_offset);
+
+ if (!bfd_set_section_contents (obfd, osec, data + data_offset,
+ sec_offset + data_offset,
+ next_data_offset - data_offset))
+ {
+ warning (_("Failed to write corefile contents (%s)."),
+ bfd_errmsg (bfd_get_error ()));
+ return false;
+ }
+
+ data_offset = next_data_offset;
+
+ /* If we already know we have an all-zero block at the next
+ offset, we can skip calling is_all_zero_block for it
+ again. */
+ if (next_all_zero_block_offset != 0)
+ data_offset += SPARSE_BLOCK_SIZE;
+ }
+ }
+
+ return true;
+}
+
static void
gcore_copy_callback (bfd *obfd, asection *osec)
{
@@ -599,8 +705,9 @@ gcore_copy_callback (bfd *obfd, asection *osec)
bfd_section_vma (osec)));
break;
}
- if (!bfd_set_section_contents (obfd, osec, memhunk.data (),
- offset, size))
+
+ if (!sparse_bfd_set_section_contents (obfd, osec, memhunk.data (),
+ offset, size))
{
warning (_("Failed to write corefile contents (%s)."),
bfd_errmsg (bfd_get_error ()));
@@ -43,23 +43,6 @@ if { [gdb_compile "${srcdir}/${subdir}/${srcfile}" "${binfile}" executable {deb
return -1
}
-# Run GDB on the bigcore program up-to where it will dump core.
-
-clean_restart ${binfile}
-gdb_test_no_output "set print sevenbit-strings"
-gdb_test_no_output "set width 0"
-
-# Get the core into the output directory.
-set_inferior_cwd_to_output_dir
-
-if {![runto_main]} {
- return 0
-}
-set print_core_line [gdb_get_line_number "Dump core"]
-gdb_test "tbreak $print_core_line"
-gdb_test continue ".*print_string.*"
-gdb_test next ".*0 = 0.*"
-
# Traverse part of bigcore's linked list of memory chunks (forward or
# backward), saving each chunk's address.
@@ -92,92 +75,11 @@ proc extract_heap { dir } {
}
return $heap
}
-set next_heap [extract_heap next]
-set prev_heap [extract_heap prev]
-
-# Save the total allocated size within GDB so that we can check
-# the core size later.
-gdb_test_no_output "set \$bytes_allocated = bytes_allocated" "save heap size"
-
-# Now create a core dump
-
-# Rename the core file to "TESTFILE.corefile" rather than just "core",
-# to avoid problems with sys admin types that like to regularly prune
-# all files named "core" from the system.
-
-# Some systems append "core" to the name of the program; others append
-# the name of the program to "core"; still others (like Linux, as of
-# May 2003) create cores named "core.PID".
-
-# Save the process ID. Some systems dump the core into core.PID.
-set inferior_pid [get_inferior_pid]
-
-# Dump core using SIGABRT
-set oldtimeout $timeout
-set timeout 600
-gdb_test "signal SIGABRT" "Program terminated with signal SIGABRT, .*"
-set timeout $oldtimeout
-
-# Find the corefile.
-set file [find_core_file $inferior_pid]
-if { $file != "" } {
- remote_exec build "mv $file $corefile"
-} else {
- untested "can't generate a core file"
- return 0
-}
-# Check that the corefile is plausibly large enough. We're trying to
-# detect the case where the operating system has truncated the file
-# just before signed wraparound. TCL, unfortunately, has a similar
-# problem - so use catch. It can handle the "bad" size but not
-# necessarily the "good" one. And we must use GDB for the comparison,
-# similarly.
-
-if {[catch {file size $corefile} core_size] == 0} {
- set core_ok 0
- gdb_test_multiple "print \$bytes_allocated < $core_size" "check core size" {
- -re " = 1\r\n$gdb_prompt $" {
- pass "check core size"
- set core_ok 1
- }
- -re " = 0\r\n$gdb_prompt $" {
- pass "check core size"
- set core_ok 0
- }
- }
-} {
- # Probably failed due to the TCL build having problems with very
- # large values. Since GDB uses a 64-bit off_t (when possible) it
- # shouldn't have this problem. Assume that things are going to
- # work. Without this assumption the test is skiped on systems
- # (such as i386 GNU/Linux with patched kernel) which do pass.
- pass "check core size"
- set core_ok 1
-}
-if {! $core_ok} {
- untested "check core size (system does not support large corefiles)"
- return 0
-}
-
-# Now load up that core file
-
-set test "load corefile"
-gdb_test_multiple "core $corefile" "$test" {
- -re "A program is being debugged already. Kill it. .y or n. " {
- send_gdb "y\n"
- exp_continue
- }
- -re "Core was generated by.*$gdb_prompt $" {
- pass "$test"
- }
-}
-
-# Finally, re-traverse bigcore's linked list, checking each chunk's
-# address against the executable. Don't use gdb_test_multiple as want
-# only one pass/fail. Don't use exp_continue as the regular
-# expression involving $heap needs to be re-evaluated for each new
-# response.
+# Re-traverse bigcore's linked list, checking each chunk's address
+# against the executable. Don't use gdb_test_multiple as want only
+# one pass/fail. Don't use exp_continue as the regular expression
+# involving $heap needs to be re-evaluated for each new response.
proc check_heap { dir heap } {
global gdb_prompt
@@ -208,5 +110,133 @@ proc check_heap { dir heap } {
}
}
-check_heap next $next_heap
-check_heap prev $prev_heap
+# The bulk of the testcase. DUMPER indicates who is supposed to dump
+# the core. It can be either "kernel", or "gdb".
+proc test {dumper} {
+ global binfile timeout corefile gdb_prompt
+
+ # Run GDB on the bigcore program up-to where it will dump core.
+
+ clean_restart ${binfile}
+ gdb_test_no_output "set print sevenbit-strings"
+ gdb_test_no_output "set width 0"
+
+ # Get the core into the output directory.
+ set_inferior_cwd_to_output_dir
+
+ if {![runto_main]} {
+ return 0
+ }
+ set print_core_line [gdb_get_line_number "Dump core"]
+ gdb_test "tbreak $print_core_line"
+ gdb_test continue ".*print_string.*"
+ gdb_test next ".*0 = 0.*"
+
+ set next_heap [extract_heap next]
+ set prev_heap [extract_heap prev]
+
+ # Save the total allocated size within GDB so that we can check
+ # the core size later.
+ gdb_test_no_output "set \$bytes_allocated = bytes_allocated" \
+ "save heap size"
+
+ # Now create a core dump.
+
+ if {$dumper == "kernel"} {
+ # Rename the core file to "TESTFILE.corefile.$dumper" rather
+ # than just "core", to avoid problems with sys admin types
+ # that like to regularly prune all files named "core" from the
+ # system.
+
+ # Some systems append "core" to the name of the program;
+ # others append the name of the program to "core"; still
+ # others (like Linux, as of May 2003) create cores named
+ # "core.PID".
+
+ # Save the process ID. Some systems dump the core into
+ # core.PID.
+ set inferior_pid [get_inferior_pid]
+
+ # Dump core using SIGABRT.
+ set oldtimeout $timeout
+ set timeout 600
+ gdb_test "signal SIGABRT" "Program terminated with signal SIGABRT, .*"
+ set timeout $oldtimeout
+
+ # Find the corefile.
+ set file [find_core_file $inferior_pid]
+ if { $file != "" } {
+ remote_exec build "mv $file $corefile.$dumper"
+ } else {
+ untested "can't generate a core file"
+ return 0
+ }
+ } elseif {$dumper == "gdb"} {
+ gdb_gcore_cmd "$corefile.$dumper" "gcore corefile"
+ } else {
+ error "unhandled dumper: $dumper"
+ }
+
+ # Check that the corefile is plausibly large enough. We're trying
+ # to detect the case where the operating system has truncated the
+ # file just before signed wraparound. TCL, unfortunately, has a
+ # similar problem - so use catch. It can handle the "bad" size
+ # but not necessarily the "good" one. And we must use GDB for the
+ # comparison, similarly.
+
+ if {[catch {file size $corefile.$dumper} core_size] == 0} {
+ set core_ok 0
+ gdb_test_multiple "print \$bytes_allocated < $core_size" \
+ "check core size" {
+ -re " = 1\r\n$gdb_prompt $" {
+ pass "check core size"
+ set core_ok 1
+ }
+ -re " = 0\r\n$gdb_prompt $" {
+ pass "check core size"
+ set core_ok 0
+ }
+ }
+ } {
+ # Probably failed due to the TCL build having problems with
+ # very large values. Since GDB uses a 64-bit off_t (when
+ # possible) it shouldn't have this problem. Assume that
+ # things are going to work. Without this assumption the test
+ is skipped on systems (such as i386 GNU/Linux with patched
+ # kernel) which do pass.
+ pass "check core size"
+ set core_ok 1
+ }
+ if {! $core_ok} {
+ untested "check core size (system does not support large corefiles)"
+ return 0
+ }
+
+ # Now load up that core file.
+
+ set test "load corefile"
+ gdb_test_multiple "core $corefile.$dumper" "$test" {
+ -re "A program is being debugged already. Kill it. .y or n. " {
+ send_gdb "y\n"
+ exp_continue
+ }
+ -re "Core was generated by.*$gdb_prompt $" {
+ pass "$test"
+ }
+ }
+
+ # Finally, re-traverse bigcore's linked list, checking each
+ # chunk's address against the executable.
+
+ check_heap next $next_heap
+ check_heap prev $prev_heap
+}
+
+foreach_with_prefix dumper {kernel gdb} {
+ # GDB's gcore is too slow when testing with the extended-gdbserver
+ # board, since it requires reading all the inferior memory.
+ if {$dumper == "gdb" && [target_info gdb_protocol] != ""} {
+ continue
+ }
+ test $dumper
+}