@@ -441,8 +441,13 @@ eu_ZIPLIB(bzlib,BZLIB,bz2,BZ2_bzdopen,bzip2)
# We need this since bzip2 doesn't have a pkgconfig file.
BZ2_LIB="$LIBS"
AC_SUBST([BZ2_LIB])
+save_LIBS="$LIBS"
+LIBS=
eu_ZIPLIB(lzma,LZMA,lzma,lzma_auto_decoder,[LZMA (xz)])
+lzma_LIBS="$LIBS"
+LIBS="$lzma_LIBS $save_LIBS"
AS_IF([test "x$with_lzma" = xyes], [LIBLZMA="liblzma"], [LIBLZMA=""])
+AC_SUBST([lzma_LIBS])
AC_SUBST([LIBLZMA])
eu_ZIPLIB(zstd,ZSTD,zstd,ZSTD_decompress,[ZSTD (zst)])
AS_IF([test "x$with_zstd" = xyes], [LIBZSTD="libzstd"], [LIBLZSTD=""])
@@ -70,7 +70,7 @@ bin_PROGRAMS += debuginfod-find
endif
debuginfod_SOURCES = debuginfod.cxx
-debuginfod_LDADD = $(libdw) $(libelf) $(libeu) $(libdebuginfod) $(argp_LDADD) $(fts_LIBS) $(libmicrohttpd_LIBS) $(sqlite3_LIBS) $(libarchive_LIBS) $(rpm_LIBS) $(jsonc_LIBS) $(libcurl_LIBS) -lpthread -ldl
+debuginfod_LDADD = $(libdw) $(libelf) $(libeu) $(libdebuginfod) $(argp_LDADD) $(fts_LIBS) $(libmicrohttpd_LIBS) $(sqlite3_LIBS) $(libarchive_LIBS) $(rpm_LIBS) $(jsonc_LIBS) $(libcurl_LIBS) $(lzma_LIBS) -lpthread -ldl
debuginfod_find_SOURCES = debuginfod-find.c
debuginfod_find_LDADD = $(libdw) $(libelf) $(libeu) $(libdebuginfod) $(argp_LDADD) $(fts_LIBS) $(jsonc_LIBS)
@@ -63,6 +63,10 @@ extern "C" {
#undef __attribute__ /* glibc bug - rhbz 1763325 */
#endif
+#ifdef USE_LZMA
+#include <lzma.h>
+#endif
+
#include <unistd.h>
#include <stdlib.h>
#include <locale.h>
@@ -162,7 +166,7 @@ string_endswith(const string& haystack, const string& needle)
// Roll this identifier for every sqlite schema incompatibility.
-#define BUILDIDS "buildids10"
+#define BUILDIDS "buildids11"
#if SQLITE_VERSION_NUMBER >= 3008000
#define WITHOUT_ROWID "without rowid"
@@ -239,15 +243,18 @@ static const char DEBUGINFOD_SQLITE_DDL[] =
" debuginfo_p integer not null,\n"
" executable_p integer not null,\n"
" file integer not null,\n"
- " mtime integer not null,\n"
+ " mtime0 integer not null,\n"
" content integer not null,\n"
+ " size integer not null,\n"
+ " mtime1 integer not null,\n"
+ " uncompressed_offset integer,\n"
" foreign key (file) references " BUILDIDS "_files(id) on update cascade on delete cascade,\n"
" foreign key (content) references " BUILDIDS "_files(id) on update cascade on delete cascade,\n"
" foreign key (buildid) references " BUILDIDS "_buildids(id) on update cascade on delete cascade,\n"
- " primary key (buildid, debuginfo_p, executable_p, file, content, mtime)\n"
+ " primary key (buildid, debuginfo_p, executable_p, file, content, mtime0)\n"
" ) " WITHOUT_ROWID ";\n"
// Index for faster delete by archive file identifier
- "create index if not exists " BUILDIDS "_r_de_idx on " BUILDIDS "_r_de (file, mtime);\n"
+ "create index if not exists " BUILDIDS "_r_de_idx on " BUILDIDS "_r_de (file, mtime0);\n"
// Index for metadata searches
"create index if not exists " BUILDIDS "_r_de_idx2 on " BUILDIDS "_r_de (content);\n"
"create table if not exists " BUILDIDS "_r_sref (\n" // outgoing dwarf sourcefile references from rpm
@@ -268,22 +275,22 @@ static const char DEBUGINFOD_SQLITE_DDL[] =
// create views to glue together some of the above tables, for webapi D queries
"create view if not exists " BUILDIDS "_query_d as \n"
"select\n"
- " b.hex as buildid, n.mtime, 'F' as sourcetype, f0.name as source0, n.mtime as mtime, null as source1\n"
+ " b.hex as buildid, 'F' as sourcetype, f0.name as source0, n.mtime as mtime0, null as source1, null as size, null as mtime1, null as uncompressed_offset\n"
" from " BUILDIDS "_buildids b, " BUILDIDS "_files_v f0, " BUILDIDS "_f_de n\n"
" where b.id = n.buildid and f0.id = n.file and n.debuginfo_p = 1\n"
"union all select\n"
- " b.hex as buildid, n.mtime, 'R' as sourcetype, f0.name as source0, n.mtime as mtime, f1.name as source1\n"
+ " b.hex as buildid, 'R' as sourcetype, f0.name as source0, n.mtime0 as mtime0, f1.name as source1, n.size as size, n.mtime1 as mtime1, n.uncompressed_offset as uncompressed_offset\n"
" from " BUILDIDS "_buildids b, " BUILDIDS "_files_v f0, " BUILDIDS "_files_v f1, " BUILDIDS "_r_de n\n"
" where b.id = n.buildid and f0.id = n.file and f1.id = n.content and n.debuginfo_p = 1\n"
";"
// ... and for E queries
"create view if not exists " BUILDIDS "_query_e as \n"
"select\n"
- " b.hex as buildid, n.mtime, 'F' as sourcetype, f0.name as source0, n.mtime as mtime, null as source1\n"
+ " b.hex as buildid, 'F' as sourcetype, f0.name as source0, n.mtime as mtime0, null as source1, null as size, null as mtime1, null as uncompressed_offset\n"
" from " BUILDIDS "_buildids b, " BUILDIDS "_files_v f0, " BUILDIDS "_f_de n\n"
" where b.id = n.buildid and f0.id = n.file and n.executable_p = 1\n"
"union all select\n"
- " b.hex as buildid, n.mtime, 'R' as sourcetype, f0.name as source0, n.mtime as mtime, f1.name as source1\n"
+ " b.hex as buildid, 'R' as sourcetype, f0.name as source0, n.mtime0 as mtime0, f1.name as source1, n.size as size, n.mtime1 as mtime1, n.uncompressed_offset as uncompressed_offset\n"
" from " BUILDIDS "_buildids b, " BUILDIDS "_files_v f0, " BUILDIDS "_files_v f1, " BUILDIDS "_r_de n\n"
" where b.id = n.buildid and f0.id = n.file and f1.id = n.content and n.executable_p = 1\n"
";"
@@ -324,8 +331,23 @@ static const char DEBUGINFOD_SQLITE_DDL[] =
// data over instead of just dropping it. But that could incur
// doubled storage costs.
//
-// buildids10: split the _files table into _parts
+// buildids11: add size, mtime1, and uncompressed_offset to _r_de, _query_d, and _query_e
"" // <<< we are here
+// buildids10: split the _files table into _parts
+ "DROP VIEW IF EXISTS buildids10_stats;\n"
+ "DROP VIEW IF EXISTS buildids10_query_s;\n"
+ "DROP VIEW IF EXISTS buildids10_query_e;\n"
+ "DROP VIEW IF EXISTS buildids10_query_d;\n"
+ "DROP TABLE IF EXISTS buildids10_r_sdef;\n"
+ "DROP TABLE IF EXISTS buildids10_r_sref;\n"
+ "DROP TABLE IF EXISTS buildids10_r_de;\n"
+ "DROP TABLE IF EXISTS buildids10_f_s;\n"
+ "DROP TABLE IF EXISTS buildids10_f_de;\n"
+ "DROP TABLE IF EXISTS buildids10_file_mtime_scanned;\n"
+ "DROP TABLE IF EXISTS buildids10_buildids;\n"
+ "DROP VIEW IF EXISTS buildids10_files_v;\n"
+ "DROP TABLE IF EXISTS buildids10_files;\n"
+ "DROP TABLE IF EXISTS buildids10_fileparts;\n"
// buildids9: widen the mtime_scanned table
"DROP VIEW IF EXISTS buildids9_stats;\n"
"DROP INDEX IF EXISTS buildids9_r_de_idx;\n"
@@ -1947,6 +1969,140 @@ handle_buildid_f_match (bool internal_req_t,
return r;
}
+
+#ifdef USE_LZMA
+// Neither RPM nor deb files support seeking to a specific file in the package.
+// Instead, to extract a specific file, we normally need to read the archive
+// sequentially until we find the file. This is very slow for files at the end
+// of a large package with lots of files, like kernel debuginfo.
+//
+// However, if the compression format used in the archive supports seeking, we
+// can accelerate this. As of July 2024, xz is the only widely-used format that
+// supports seeking, and usually only in multi-threaded mode. Luckily, the
+// kernel-debuginfo package in Fedora and its downstreams, and the
+// linux-image-*-dbg package in Debian and its downstreams, all happen to use
+// this.
+//
+// The xz format [1] ends with an index of independently compressed blocks in
+// the stream. In RPM and deb files, the xz stream is the last thing in the
+// file, so we assume that the xz Stream Footer is at the end of the package
+// file and do everything relative to that. For each file in the archive, we
+// remember the size and offset of the file data in the uncompressed xz stream,
+// then we use the index to seek to that offset when we need that file.
+//
+// 1: https://xz.tukaani.org/format/xz-file-format.txt
+
+// Return whether an archive supports seeking.
+static bool
+is_seekable_archive (const string& rps, struct archive* a)
+{
+ // Only xz supports seeking.
+ if (archive_filter_code (a, 0) != ARCHIVE_FILTER_XZ)
+ return false;
+
+ int fd = open (rps.c_str(), O_RDONLY);
+ if (fd < 0)
+ return false;
+ defer_dtor<int,int> fd_closer (fd, close);
+
+ // Seek to the xz Stream Footer. We assume that it's the last thing in the
+ // file, which is true for RPM and deb files.
+ off_t footer_pos = -LZMA_STREAM_HEADER_SIZE;
+ if (lseek (fd, footer_pos, SEEK_END) == -1)
+ return false;
+
+ // Decode the Stream Footer.
+ uint8_t footer[LZMA_STREAM_HEADER_SIZE];
+ size_t footer_read = 0;
+ while (footer_read < sizeof (footer))
+ {
+ ssize_t bytes_read = read (fd, footer + footer_read,
+ sizeof (footer) - footer_read);
+ if (bytes_read < 0)
+ {
+ if (errno == EINTR)
+ continue;
+ return false;
+ }
+ if (bytes_read == 0)
+ return false;
+ footer_read += bytes_read;
+ }
+
+ lzma_stream_flags stream_flags;
+ lzma_ret ret = lzma_stream_footer_decode (&stream_flags, footer);
+ if (ret != LZMA_OK)
+ return false;
+
+ // Seek to the xz Index.
+ if (lseek (fd, footer_pos - stream_flags.backward_size, SEEK_END) == -1)
+ return false;
+
+ // Decode the Number of Records in the Index. liblzma doesn't have an API for
+ // this if you don't want to decode the whole Index, so we have to do it
+ // ourselves.
+ //
+ // We need 1 byte for the Index Indicator plus 1-9 bytes for the
+ // variable-length integer Number of Records.
+ uint8_t index[10];
+ size_t index_read = 0;
+ while (index_read == 0) {
+ ssize_t bytes_read = read (fd, index, sizeof (index));
+ if (bytes_read < 0)
+ {
+ if (errno == EINTR)
+ continue;
+ return false;
+ }
+ if (bytes_read == 0)
+ return false;
+ index_read += bytes_read;
+ }
+ // The Index Indicator must be 0.
+ if (index[0] != 0)
+ return false;
+
+ lzma_vli num_records;
+ size_t pos = 0;
+ size_t in_pos = 1;
+ while (true)
+ {
+ if (in_pos >= index_read)
+ {
+ ssize_t bytes_read = read (fd, index, sizeof (index));
+ if (bytes_read < 0)
+ {
+ if (errno == EINTR)
+ continue;
+ return false;
+ }
+ if (bytes_read == 0)
+ return false;
+ index_read = bytes_read;
+ in_pos = 0;
+ }
+ ret = lzma_vli_decode (&num_records, &pos, index, &in_pos, index_read);
+ if (ret == LZMA_STREAM_END)
+ break;
+ else if (ret != LZMA_OK)
+ return false;
+ }
+
+ if (verbose > 3)
+ obatched(clog) << rps << " has " << num_records << " xz Blocks" << endl;
+
+ // The file is only seekable if it has more than one Block.
+ return num_records > 1;
+}
+#else
+static bool
+is_seekable_archive (const string& rps __attribute__ ((unused)), struct archive* a __attribute__ ((unused)))
+{
+ return false;
+}
+#endif
+
+
// For security/portability reasons, many distro-package archives have
// a "./" in front of path names; others have nothing, others have
// "/". Canonicalize them all to a single leading "/", with the
@@ -2557,16 +2713,16 @@ handle_buildid (MHD_Connection* conn,
if (atype_code == "D")
{
pp = new sqlite_ps (thisdb, "mhd-query-d",
- "select mtime, sourcetype, source0, source1 from " BUILDIDS "_query_d where buildid = ? "
- "order by mtime desc");
+ "select mtime0, sourcetype, source0, source1 from " BUILDIDS "_query_d where buildid = ? "
+ "order by mtime0 desc");
pp->reset();
pp->bind(1, buildid);
}
else if (atype_code == "E")
{
pp = new sqlite_ps (thisdb, "mhd-query-e",
- "select mtime, sourcetype, source0, source1 from " BUILDIDS "_query_e where buildid = ? "
- "order by mtime desc");
+ "select mtime0, sourcetype, source0, source1 from " BUILDIDS "_query_e where buildid = ? "
+ "order by mtime0 desc");
pp->reset();
pp->bind(1, buildid);
}
@@ -2589,9 +2745,9 @@ handle_buildid (MHD_Connection* conn,
else if (atype_code == "I")
{
pp = new sqlite_ps (thisdb, "mhd-query-i",
- "select mtime, sourcetype, source0, source1, 1 as debug_p from " BUILDIDS "_query_d where buildid = ? "
+ "select mtime0, sourcetype, source0, source1, 1 as debug_p from " BUILDIDS "_query_d where buildid = ? "
"union all "
- "select mtime, sourcetype, source0, source1, 0 as debug_p from " BUILDIDS "_query_e where buildid = ? "
+ "select mtime0, sourcetype, source0, source1, 0 as debug_p from " BUILDIDS "_query_e where buildid = ? "
"order by debug_p desc, mtime desc");
pp->reset();
pp->bind(1, buildid);
@@ -3821,7 +3977,7 @@ archive_classify (const string& rps, string& archive_extension, int64_t archivei
sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_upsert_fileparts, sqlite_ps& ps_upsert_file,
sqlite_ps& ps_lookup_file,
sqlite_ps& ps_upsert_de, sqlite_ps& ps_upsert_sref, sqlite_ps& ps_upsert_sdef,
- time_t mtime,
+ time_t mtime0,
unsigned& fts_executable, unsigned& fts_debuginfo, unsigned& fts_sref, unsigned& fts_sdef,
bool& fts_sref_complete_p)
{
@@ -3875,6 +4031,10 @@ archive_classify (const string& rps, string& archive_extension, int64_t archivei
if (verbose > 3)
obatched(clog) << "libarchive scanning " << rps << " id " << archiveid << endl;
+ bool seekable = is_seekable_archive (rps, a);
+ if (verbose > 2 && seekable)
+ obatched(clog) << rps << " is seekable" << endl;
+
bool any_exceptions = false;
while(1) // parse archive entries
{
@@ -3896,6 +4056,12 @@ archive_classify (const string& rps, string& archive_extension, int64_t archivei
if (verbose > 3)
obatched(clog) << "libarchive checking " << fn << endl;
+ int64_t size = archive_entry_size (e);
+ time_t mtime1 = archive_entry_mtime (e);
+ int64_t uncompressed_offset;
+ if (seekable)
+ uncompressed_offset = archive_filter_bytes (a, 0);
+
// extract this file to a temporary file
char* tmppath = NULL;
rc = asprintf (&tmppath, "%s/debuginfod-classify.XXXXXX", tmpdir.c_str());
@@ -3978,15 +4144,22 @@ archive_classify (const string& rps, string& archive_extension, int64_t archivei
if (executable_p || debuginfo_p)
{
- ps_upsert_de
+ auto& ps =
+ ps_upsert_de
.reset()
.bind(1, buildid)
.bind(2, debuginfo_p ? 1 : 0)
.bind(3, executable_p ? 1 : 0)
.bind(4, archiveid)
- .bind(5, mtime)
+ .bind(5, mtime0)
.bind(6, fileid)
- .step_ok_done();
+ .bind(7, size)
+ .bind(8, mtime1);
+ if (seekable)
+ ps.bind(9, uncompressed_offset);
+ else
+ ps.bind(9);
+ ps.step_ok_done();
}
else // potential source - sdef record
{
@@ -3994,18 +4167,25 @@ archive_classify (const string& rps, string& archive_extension, int64_t archivei
ps_upsert_sdef
.reset()
.bind(1, archiveid)
- .bind(2, mtime)
+ .bind(2, mtime0)
.bind(3, fileid)
.step_ok_done();
}
if ((verbose > 2) && (executable_p || debuginfo_p))
- obatched(clog) << "recorded buildid=" << buildid << " rpm=" << rps << " file=" << fn
- << " mtime=" << mtime << " atype="
+ {
+ obatched ob(clog);
+ auto& o = ob << "recorded buildid=" << buildid << " rpm=" << rps << " file=" << fn
+ << " mtime0=" << mtime0 << " size=" << size
+ << " mtime1=" << mtime1
+ << " atype="
<< (executable_p ? "E" : "")
<< (debuginfo_p ? "D" : "")
- << " sourcefiles=" << sourcefiles.size() << endl;
-
+ << " sourcefiles=" << sourcefiles.size();
+ if (seekable)
+ o << " uncompressed_offset=" << uncompressed_offset;
+ o << endl;
+ }
}
catch (const reportable_exception& e)
{
@@ -4169,8 +4349,8 @@ scan ()
" from " BUILDIDS "_files f, " BUILDIDS "_fileparts p1, " BUILDIDS "_fileparts p2 \n"
" where f.dirname = p1.id and f.basename = p2.id and p1.name = ? and p2.name = ?;\n");
sqlite_ps ps_r_upsert_de (db, "rpm-de-insert",
- "insert or ignore into " BUILDIDS "_r_de (buildid, debuginfo_p, executable_p, file, mtime, content) values ("
- "(select id from " BUILDIDS "_buildids where hex = ?), ?, ?, ?, ?, ?);");
+ "insert or ignore into " BUILDIDS "_r_de (buildid, debuginfo_p, executable_p, file, mtime0, content, size, mtime1, uncompressed_offset) values ("
+ "(select id from " BUILDIDS "_buildids where hex = ?), ?, ?, ?, ?, ?, ?, ?, ?);");
sqlite_ps ps_r_upsert_sref (db, "rpm-sref-insert",
"insert or ignore into " BUILDIDS "_r_sref (buildid, artifactsrc) values ("
"(select id from " BUILDIDS "_buildids where hex = ?), "
@@ -4559,7 +4739,7 @@ void groom()
// as long as we make progress.
sqlite_ps files_del_f_de (db, "nuke f_de", "delete from " BUILDIDS "_f_de where file = ? and mtime = ?");
- sqlite_ps files_del_r_de (db, "nuke r_de", "delete from " BUILDIDS "_r_de where file = ? and mtime = ?");
+ sqlite_ps files_del_r_de (db, "nuke r_de", "delete from " BUILDIDS "_r_de where file = ? and mtime0 = ?");
sqlite_ps files_del_scan (db, "nuke f_m_s", "delete from " BUILDIDS "_file_mtime_scanned "
"where file = ? and mtime = ?");