blob_db_impl.cc 67.4 KB
Newer Older
1

Anirban Rahut's avatar
Anirban Rahut 已提交
2
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
Siying Dong's avatar
Siying Dong 已提交
3
4
5
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
Anirban Rahut's avatar
Anirban Rahut 已提交
6
7
8
9
10
11
12
#ifndef ROCKSDB_LITE

#include "utilities/blob_db/blob_db_impl.h"
#include <algorithm>
#include <cinttypes>
#include <iomanip>
#include <memory>
13
#include <sstream>
Anirban Rahut's avatar
Anirban Rahut 已提交
14

15
#include "db/blob/blob_index.h"
16
#include "db/db_impl/db_impl.h"
Anirban Rahut's avatar
Anirban Rahut 已提交
17
#include "db/write_batch_internal.h"
18
#include "env/composite_env_wrapper.h"
19
20
#include "file/file_util.h"
#include "file/filename.h"
21
#include "file/random_access_file_reader.h"
22
#include "file/sst_file_manager_impl.h"
23
#include "file/writable_file_writer.h"
24
#include "logging/logging.h"
Anirban Rahut's avatar
Anirban Rahut 已提交
25
#include "monitoring/instrumented_mutex.h"
Yi Wu's avatar
Yi Wu 已提交
26
#include "monitoring/statistics.h"
Anirban Rahut's avatar
Anirban Rahut 已提交
27
28
29
30
31
#include "rocksdb/convenience.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/utilities/stackable_db.h"
#include "rocksdb/utilities/transaction.h"
32
33
34
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_builder.h"
#include "table/block_based/block_builder.h"
Anirban Rahut's avatar
Anirban Rahut 已提交
35
#include "table/meta_blocks.h"
36
#include "test_util/sync_point.h"
Siying Dong's avatar
Siying Dong 已提交
37
#include "util/cast_util.h"
Anirban Rahut's avatar
Anirban Rahut 已提交
38
#include "util/crc32c.h"
Yi Wu's avatar
Yi Wu 已提交
39
#include "util/mutexlock.h"
Anirban Rahut's avatar
Anirban Rahut 已提交
40
#include "util/random.h"
Yi Wu's avatar
Yi Wu 已提交
41
#include "util/stop_watch.h"
Anirban Rahut's avatar
Anirban Rahut 已提交
42
#include "util/timer_queue.h"
Yi Wu's avatar
Yi Wu 已提交
43
#include "utilities/blob_db/blob_compaction_filter.h"
44
#include "utilities/blob_db/blob_db_iterator.h"
Yi Wu's avatar
Yi Wu 已提交
45
#include "utilities/blob_db/blob_db_listener.h"
Anirban Rahut's avatar
Anirban Rahut 已提交
46
47
48
49
50

namespace {
int kBlockBasedTableVersionFormat = 2;
}  // end namespace

51
namespace ROCKSDB_NAMESPACE {
Anirban Rahut's avatar
Anirban Rahut 已提交
52
53
namespace blob_db {

54
55
56
57
58
59
60
61
62
bool BlobFileComparator::operator()(
    const std::shared_ptr<BlobFile>& lhs,
    const std::shared_ptr<BlobFile>& rhs) const {
  return lhs->BlobFileNumber() > rhs->BlobFileNumber();
}

bool BlobFileComparatorTTL::operator()(
    const std::shared_ptr<BlobFile>& lhs,
    const std::shared_ptr<BlobFile>& rhs) const {
Yi Wu's avatar
Yi Wu 已提交
63
  assert(lhs->HasTTL() && rhs->HasTTL());
Yi Wu's avatar
Yi Wu 已提交
64
65
66
67
68
69
  if (lhs->expiration_range_.first < rhs->expiration_range_.first) {
    return true;
  }
  if (lhs->expiration_range_.first > rhs->expiration_range_.first) {
    return false;
  }
70
  return lhs->BlobFileNumber() < rhs->BlobFileNumber();
Anirban Rahut's avatar
Anirban Rahut 已提交
71
72
73
74
}

BlobDBImpl::BlobDBImpl(const std::string& dbname,
                       const BlobDBOptions& blob_db_options,
Yi Wu's avatar
Yi Wu 已提交
75
76
77
78
                       const DBOptions& db_options,
                       const ColumnFamilyOptions& cf_options)
    : BlobDB(),
      dbname_(dbname),
Anirban Rahut's avatar
Anirban Rahut 已提交
79
      db_impl_(nullptr),
Yi Wu's avatar
Yi Wu 已提交
80
      env_(db_options.env),
Anirban Rahut's avatar
Anirban Rahut 已提交
81
82
      bdb_options_(blob_db_options),
      db_options_(db_options),
Yi Wu's avatar
Yi Wu 已提交
83
      cf_options_(cf_options),
Anirban Rahut's avatar
Anirban Rahut 已提交
84
      env_options_(db_options),
Yi Wu's avatar
Yi Wu 已提交
85
      statistics_(db_options_.statistics.get()),
Anirban Rahut's avatar
Anirban Rahut 已提交
86
      next_file_number_(1),
87
      flush_sequence_(0),
Yi Wu's avatar
Yi Wu 已提交
88
      closed_(true),
Anirban Rahut's avatar
Anirban Rahut 已提交
89
      open_file_count_(0),
Yi Wu's avatar
Yi Wu 已提交
90
91
92
93
94
      total_blob_size_(0),
      live_sst_size_(0),
      fifo_eviction_seq_(0),
      evict_expiration_up_to_(0),
      debug_level_(0) {
Anirban Rahut's avatar
Anirban Rahut 已提交
95
96
97
  blob_dir_ = (bdb_options_.path_relative)
                  ? dbname + "/" + bdb_options_.blob_dir
                  : bdb_options_.blob_dir;
98
  env_options_.bytes_per_sync = blob_db_options.bytes_per_sync;
Anirban Rahut's avatar
Anirban Rahut 已提交
99
100
}

Yi Wu's avatar
Yi Wu 已提交
101
BlobDBImpl::~BlobDBImpl() {
102
  tqueue_.shutdown();
Yi Wu's avatar
Yi Wu 已提交
103
  // CancelAllBackgroundWork(db_, true);
Yi Wu's avatar
Yi Wu 已提交
104
105
106
107
108
109
110
111
112
  Status s __attribute__((__unused__)) = Close();
  assert(s.ok());
}

Status BlobDBImpl::Close() {
  if (closed_) {
    return Status::OK();
  }
  closed_ = true;
Yi Wu's avatar
Yi Wu 已提交
113

Yi Wu's avatar
Yi Wu 已提交
114
115
116
117
118
119
120
121
122
123
124
125
126
127
  // Close base DB before BlobDBImpl destructs to stop event listener and
  // compaction filter call.
  Status s = db_->Close();
  // delete db_ anyway even if close failed.
  delete db_;
  // Reset pointers to avoid StackableDB delete the pointer again.
  db_ = nullptr;
  db_impl_ = nullptr;
  if (!s.ok()) {
    return s;
  }

  s = SyncBlobFiles();
  return s;
Yi Wu's avatar
Yi Wu 已提交
128
129
130
131
132
133
}

BlobDBOptions BlobDBImpl::GetBlobDBOptions() const { return bdb_options_; }

Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
  assert(handles != nullptr);
Anirban Rahut's avatar
Anirban Rahut 已提交
134
  assert(db_ == nullptr);
135

Yi Wu's avatar
Yi Wu 已提交
136
137
138
  if (blob_dir_.empty()) {
    return Status::NotSupported("No blob directory in options");
  }
139

Yi Wu's avatar
Yi Wu 已提交
140
141
142
143
  if (cf_options_.compaction_filter != nullptr ||
      cf_options_.compaction_filter_factory != nullptr) {
    return Status::NotSupported("Blob DB doesn't support compaction filter.");
  }
144
145
146
147
148
149
150

  if (bdb_options_.garbage_collection_cutoff < 0.0 ||
      bdb_options_.garbage_collection_cutoff > 1.0) {
    return Status::InvalidArgument(
        "Garbage collection cutoff must be in the interval [0.0, 1.0]");
  }

151
152
153
154
155
156
157
158
159
  // Temporarily disable compactions in the base DB during open; save the user
  // defined value beforehand so we can restore it once BlobDB is initialized.
  // Note: this is only needed if garbage collection is enabled.
  const bool disable_auto_compactions = cf_options_.disable_auto_compactions;

  if (bdb_options_.enable_garbage_collection) {
    cf_options_.disable_auto_compactions = true;
  }

Yi Wu's avatar
Yi Wu 已提交
160
  Status s;
Anirban Rahut's avatar
Anirban Rahut 已提交
161

Yi Wu's avatar
Yi Wu 已提交
162
163
164
165
166
167
168
  // Create info log.
  if (db_options_.info_log == nullptr) {
    s = CreateLoggerFromOptions(dbname_, db_options_, &db_options_.info_log);
    if (!s.ok()) {
      return s;
    }
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
169

Yi Wu's avatar
Yi Wu 已提交
170
  ROCKS_LOG_INFO(db_options_.info_log, "Opening BlobDB...");
Anirban Rahut's avatar
Anirban Rahut 已提交
171

Yi Wu's avatar
Yi Wu 已提交
172
173
  // Open blob directory.
  s = env_->CreateDirIfMissing(blob_dir_);
Anirban Rahut's avatar
Anirban Rahut 已提交
174
  if (!s.ok()) {
Yi Wu's avatar
Yi Wu 已提交
175
176
177
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "Failed to create blob_dir %s, status: %s",
                    blob_dir_.c_str(), s.ToString().c_str());
Anirban Rahut's avatar
Anirban Rahut 已提交
178
  }
Yi Wu's avatar
Yi Wu 已提交
179
  s = env_->NewDirectory(blob_dir_, &dir_ent_);
Anirban Rahut's avatar
Anirban Rahut 已提交
180
  if (!s.ok()) {
Yi Wu's avatar
Yi Wu 已提交
181
182
183
184
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "Failed to open blob_dir %s, status: %s", blob_dir_.c_str(),
                    s.ToString().c_str());
    return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
185
186
  }

Yi Wu's avatar
Yi Wu 已提交
187
188
189
190
  // Open blob files.
  s = OpenAllBlobFiles();
  if (!s.ok()) {
    return s;
Yi Wu's avatar
Yi Wu 已提交
191
192
  }

Yi Wu's avatar
Yi Wu 已提交
193
  // Update options
194
195
196
197
198
199
200
201
202
203
204
  if (bdb_options_.enable_garbage_collection) {
    db_options_.listeners.push_back(std::make_shared<BlobDBListenerGC>(this));
    cf_options_.compaction_filter_factory =
        std::make_shared<BlobIndexCompactionFilterFactoryGC>(this, env_,
                                                             statistics_);
  } else {
    db_options_.listeners.push_back(std::make_shared<BlobDBListener>(this));
    cf_options_.compaction_filter_factory =
        std::make_shared<BlobIndexCompactionFilterFactory>(this, env_,
                                                           statistics_);
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
205

Yi Wu's avatar
Yi Wu 已提交
206
207
208
  // Open base db.
  ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options_);
  s = DB::Open(db_options_, dbname_, {cf_descriptor}, handles, &db_);
Anirban Rahut's avatar
Anirban Rahut 已提交
209
  if (!s.ok()) {
Yi Wu's avatar
Yi Wu 已提交
210
211
212
    return s;
  }
  db_impl_ = static_cast_with_check<DBImpl, DB>(db_->GetRootDB());
213

214
215
216
217
218
219
220
221
  // Initialize SST file <-> oldest blob file mapping if garbage collection
  // is enabled.
  if (bdb_options_.enable_garbage_collection) {
    std::vector<LiveFileMetaData> live_files;
    db_->GetLiveFilesMetaData(&live_files);

    InitializeBlobFileToSstMapping(live_files);

222
223
    MarkUnreferencedBlobFilesObsoleteDuringOpen();

224
225
226
227
228
229
230
231
232
233
234
235
    if (!disable_auto_compactions) {
      s = db_->EnableAutoCompaction(*handles);
      if (!s.ok()) {
        ROCKS_LOG_ERROR(
            db_options_.info_log,
            "Failed to enable automatic compactions during open, status: %s",
            s.ToString().c_str());
        return s;
      }
    }
  }

236
237
238
239
240
  // Add trash files in blob dir to file delete scheduler.
  SstFileManagerImpl* sfm = static_cast<SstFileManagerImpl*>(
      db_impl_->immutable_db_options().sst_file_manager.get());
  DeleteScheduler::CleanupDirectory(env_, sfm, blob_dir_);

Yi Wu's avatar
Yi Wu 已提交
241
  UpdateLiveSSTSize();
Yi Wu's avatar
Yi Wu 已提交
242
243
244
245

  // Start background jobs.
  if (!bdb_options_.disable_background_tasks) {
    StartBackgroundTasks();
Anirban Rahut's avatar
Anirban Rahut 已提交
246
247
  }

Yi Wu's avatar
Yi Wu 已提交
248
  ROCKS_LOG_INFO(db_options_.info_log, "BlobDB pointer %p", this);
Yi Wu's avatar
Yi Wu 已提交
249
  bdb_options_.Dump(db_options_.info_log.get());
Yi Wu's avatar
Yi Wu 已提交
250
  closed_ = false;
Anirban Rahut's avatar
Anirban Rahut 已提交
251
252
253
254
255
256
  return s;
}

void BlobDBImpl::StartBackgroundTasks() {
  // store a call to a member function and object
  tqueue_.add(
Yi Wu's avatar
Yi Wu 已提交
257
      kReclaimOpenFilesPeriodMillisecs,
Anirban Rahut's avatar
Anirban Rahut 已提交
258
259
      std::bind(&BlobDBImpl::ReclaimOpenFiles, this, std::placeholders::_1));
  tqueue_.add(
260
261
      kDeleteObsoleteFilesPeriodMillisecs,
      std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1));
Yi Wu's avatar
Yi Wu 已提交
262
  tqueue_.add(kSanityCheckPeriodMillisecs,
Anirban Rahut's avatar
Anirban Rahut 已提交
263
264
              std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1));
  tqueue_.add(
265
266
      kEvictExpiredFilesPeriodMillisecs,
      std::bind(&BlobDBImpl::EvictExpiredFiles, this, std::placeholders::_1));
Anirban Rahut's avatar
Anirban Rahut 已提交
267
268
}

Yi Wu's avatar
Yi Wu 已提交
269
270
Status BlobDBImpl::GetAllBlobFiles(std::set<uint64_t>* file_numbers) {
  assert(file_numbers != nullptr);
Anirban Rahut's avatar
Anirban Rahut 已提交
271
  std::vector<std::string> all_files;
Yi Wu's avatar
Yi Wu 已提交
272
273
274
275
276
277
  Status s = env_->GetChildren(blob_dir_, &all_files);
  if (!s.ok()) {
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "Failed to get list of blob files, status: %s",
                    s.ToString().c_str());
    return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
278
279
  }

Yi Wu's avatar
Yi Wu 已提交
280
281
  for (const auto& file_name : all_files) {
    uint64_t file_number;
Anirban Rahut's avatar
Anirban Rahut 已提交
282
    FileType type;
Yi Wu's avatar
Yi Wu 已提交
283
284
285
    bool success = ParseFileName(file_name, &file_number, &type);
    if (success && type == kBlobFile) {
      file_numbers->insert(file_number);
Anirban Rahut's avatar
Anirban Rahut 已提交
286
    } else {
Yi Wu's avatar
Yi Wu 已提交
287
      ROCKS_LOG_WARN(db_options_.info_log,
Yi Wu's avatar
Yi Wu 已提交
288
                     "Skipping file in blob directory: %s", file_name.c_str());
Anirban Rahut's avatar
Anirban Rahut 已提交
289
290
291
    }
  }

Yi Wu's avatar
Yi Wu 已提交
292
  return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
293
294
}

Yi Wu's avatar
Yi Wu 已提交
295
296
297
298
299
300
Status BlobDBImpl::OpenAllBlobFiles() {
  std::set<uint64_t> file_numbers;
  Status s = GetAllBlobFiles(&file_numbers);
  if (!s.ok()) {
    return s;
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
301

Yi Wu's avatar
Yi Wu 已提交
302
303
  if (!file_numbers.empty()) {
    next_file_number_.store(*file_numbers.rbegin() + 1);
Anirban Rahut's avatar
Anirban Rahut 已提交
304
305
  }

306
307
308
  std::ostringstream blob_file_oss;
  std::ostringstream live_imm_oss;
  std::ostringstream obsolete_file_oss;
Anirban Rahut's avatar
Anirban Rahut 已提交
309

Yi Wu's avatar
Yi Wu 已提交
310
311
312
  for (auto& file_number : file_numbers) {
    std::shared_ptr<BlobFile> blob_file = std::make_shared<BlobFile>(
        this, blob_dir_, file_number, db_options_.info_log.get());
313
    blob_file->MarkImmutable(/* sequence */ 0);
Anirban Rahut's avatar
Anirban Rahut 已提交
314

Yi Wu's avatar
Yi Wu 已提交
315
316
317
318
    // Read file header and footer
    Status read_metadata_status = blob_file->ReadMetadata(env_, env_options_);
    if (read_metadata_status.IsCorruption()) {
      // Remove incomplete file.
319
320
      if (!obsolete_files_.empty()) {
        obsolete_file_oss << ", ";
Yi Wu's avatar
Yi Wu 已提交
321
      }
322
323
324
      obsolete_file_oss << file_number;

      ObsoleteBlobFile(blob_file, 0 /*obsolete_seq*/, false /*update_size*/);
Yi Wu's avatar
Yi Wu 已提交
325
326
327
      continue;
    } else if (!read_metadata_status.ok()) {
      ROCKS_LOG_ERROR(db_options_.info_log,
328
                      "Unable to read metadata of blob file %" PRIu64
Yi Wu's avatar
Yi Wu 已提交
329
330
331
332
                      ", status: '%s'",
                      file_number, read_metadata_status.ToString().c_str());
      return read_metadata_status;
    }
Anirban Rahut's avatar
Anirban Rahut 已提交
333

Yi Wu's avatar
Yi Wu 已提交
334
335
    total_blob_size_ += blob_file->GetFileSize();

336
337
338
339
340
    if (!blob_files_.empty()) {
      blob_file_oss << ", ";
    }
    blob_file_oss << file_number;

Yi Wu's avatar
Yi Wu 已提交
341
    blob_files_[file_number] = blob_file;
342
343
344
345
346
347
348
349

    if (!blob_file->HasTTL()) {
      if (!live_imm_non_ttl_blob_files_.empty()) {
        live_imm_oss << ", ";
      }
      live_imm_oss << file_number;

      live_imm_non_ttl_blob_files_[file_number] = blob_file;
Anirban Rahut's avatar
Anirban Rahut 已提交
350
351
352
    }
  }

Yi Wu's avatar
Yi Wu 已提交
353
354
  ROCKS_LOG_INFO(db_options_.info_log,
                 "Found %" ROCKSDB_PRIszt " blob files: %s", blob_files_.size(),
355
356
357
358
                 blob_file_oss.str().c_str());
  ROCKS_LOG_INFO(
      db_options_.info_log, "Found %" ROCKSDB_PRIszt " non-TTL blob files: %s",
      live_imm_non_ttl_blob_files_.size(), live_imm_oss.str().c_str());
Yi Wu's avatar
Yi Wu 已提交
359
360
361
  ROCKS_LOG_INFO(db_options_.info_log,
                 "Found %" ROCKSDB_PRIszt
                 " incomplete or corrupted blob files: %s",
362
                 obsolete_files_.size(), obsolete_file_oss.str().c_str());
Yi Wu's avatar
Yi Wu 已提交
363
  return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
364
365
}

366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
template <typename Linker>
void BlobDBImpl::LinkSstToBlobFileImpl(uint64_t sst_file_number,
                                       uint64_t blob_file_number,
                                       Linker linker) {
  assert(bdb_options_.enable_garbage_collection);
  assert(blob_file_number != kInvalidBlobFileNumber);

  auto it = blob_files_.find(blob_file_number);
  if (it == blob_files_.end()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Blob file %" PRIu64
                   " not found while trying to link "
                   "SST file %" PRIu64,
                   blob_file_number, sst_file_number);
    return;
  }

  BlobFile* const blob_file = it->second.get();
  assert(blob_file);

  linker(blob_file, sst_file_number);

  ROCKS_LOG_INFO(db_options_.info_log,
                 "Blob file %" PRIu64 " linked to SST file %" PRIu64,
                 blob_file_number, sst_file_number);
}

void BlobDBImpl::LinkSstToBlobFile(uint64_t sst_file_number,
                                   uint64_t blob_file_number) {
  auto linker = [](BlobFile* blob_file, uint64_t sst_file) {
    WriteLock file_lock(&blob_file->mutex_);
    blob_file->LinkSstFile(sst_file);
  };

  LinkSstToBlobFileImpl(sst_file_number, blob_file_number, linker);
}

void BlobDBImpl::LinkSstToBlobFileNoLock(uint64_t sst_file_number,
                                         uint64_t blob_file_number) {
  auto linker = [](BlobFile* blob_file, uint64_t sst_file) {
    blob_file->LinkSstFile(sst_file);
  };

  LinkSstToBlobFileImpl(sst_file_number, blob_file_number, linker);
}

void BlobDBImpl::UnlinkSstFromBlobFile(uint64_t sst_file_number,
                                       uint64_t blob_file_number) {
  assert(bdb_options_.enable_garbage_collection);
  assert(blob_file_number != kInvalidBlobFileNumber);

  auto it = blob_files_.find(blob_file_number);
  if (it == blob_files_.end()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Blob file %" PRIu64
                   " not found while trying to unlink "
                   "SST file %" PRIu64,
                   blob_file_number, sst_file_number);
    return;
  }

  BlobFile* const blob_file = it->second.get();
  assert(blob_file);

  {
    WriteLock file_lock(&blob_file->mutex_);
    blob_file->UnlinkSstFile(sst_file_number);
  }

  ROCKS_LOG_INFO(db_options_.info_log,
                 "Blob file %" PRIu64 " unlinked from SST file %" PRIu64,
                 blob_file_number, sst_file_number);
}

void BlobDBImpl::InitializeBlobFileToSstMapping(
    const std::vector<LiveFileMetaData>& live_files) {
  assert(bdb_options_.enable_garbage_collection);

  for (const auto& live_file : live_files) {
    const uint64_t sst_file_number = live_file.file_number;
    const uint64_t blob_file_number = live_file.oldest_blob_file_number;

    if (blob_file_number == kInvalidBlobFileNumber) {
      continue;
    }

    LinkSstToBlobFileNoLock(sst_file_number, blob_file_number);
  }
}

void BlobDBImpl::ProcessFlushJobInfo(const FlushJobInfo& info) {
  assert(bdb_options_.enable_garbage_collection);

459
  WriteLock lock(&mutex_);
460

461
  if (info.oldest_blob_file_number != kInvalidBlobFileNumber) {
462
463
    LinkSstToBlobFile(info.file_number, info.oldest_blob_file_number);
  }
464
465
466
467
468

  assert(flush_sequence_ < info.largest_seqno);
  flush_sequence_ = info.largest_seqno;

  MarkUnreferencedBlobFilesObsolete();
469
470
471
472
473
}

void BlobDBImpl::ProcessCompactionJobInfo(const CompactionJobInfo& info) {
  assert(bdb_options_.enable_garbage_collection);

474
475
476
477
  if (!info.status.ok()) {
    return;
  }

478
  // Note: the same SST file may appear in both the input and the output
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
  // file list in case of a trivial move. We walk through the two lists
  // below in a fashion that's similar to merge sort to detect this.

  auto cmp = [](const CompactionFileInfo& lhs, const CompactionFileInfo& rhs) {
    return lhs.file_number < rhs.file_number;
  };

  auto inputs = info.input_file_infos;
  auto iit = inputs.begin();
  const auto iit_end = inputs.end();

  std::sort(iit, iit_end, cmp);

  auto outputs = info.output_file_infos;
  auto oit = outputs.begin();
  const auto oit_end = outputs.end();

  std::sort(oit, oit_end, cmp);
497

498
  WriteLock lock(&mutex_);
499

500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
  while (iit != iit_end && oit != oit_end) {
    const auto& input = *iit;
    const auto& output = *oit;

    if (input.file_number == output.file_number) {
      ++iit;
      ++oit;
    } else if (input.file_number < output.file_number) {
      if (input.oldest_blob_file_number != kInvalidBlobFileNumber) {
        UnlinkSstFromBlobFile(input.file_number, input.oldest_blob_file_number);
      }

      ++iit;
    } else {
      assert(output.file_number < input.file_number);

      if (output.oldest_blob_file_number != kInvalidBlobFileNumber) {
        LinkSstToBlobFile(output.file_number, output.oldest_blob_file_number);
      }

      ++oit;
    }
  }

  while (iit != iit_end) {
    const auto& input = *iit;

    if (input.oldest_blob_file_number != kInvalidBlobFileNumber) {
      UnlinkSstFromBlobFile(input.file_number, input.oldest_blob_file_number);
529
    }
530

531
    ++iit;
532
533
  }

534
535
536
537
538
  while (oit != oit_end) {
    const auto& output = *oit;

    if (output.oldest_blob_file_number != kInvalidBlobFileNumber) {
      LinkSstToBlobFile(output.file_number, output.oldest_blob_file_number);
539
540
    }

541
    ++oit;
542
  }
543

544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
  MarkUnreferencedBlobFilesObsolete();
}

bool BlobDBImpl::MarkBlobFileObsoleteIfNeeded(
    const std::shared_ptr<BlobFile>& blob_file, SequenceNumber obsolete_seq) {
  assert(blob_file);
  assert(!blob_file->HasTTL());
  assert(blob_file->Immutable());
  assert(bdb_options_.enable_garbage_collection);

  // Note: FIFO eviction could have marked this file obsolete already.
  if (blob_file->Obsolete()) {
    return true;
  }

  // We cannot mark this file (or any higher-numbered files for that matter)
  // obsolete if it is referenced by any memtables or SSTs. We keep track of
  // the SSTs explicitly. To account for memtables, we keep track of the highest
  // sequence number received in flush notifications, and we do not mark the
  // blob file obsolete if there are still unflushed memtables from before
  // the time the blob file was closed.
  if (blob_file->GetImmutableSequence() > flush_sequence_ ||
      !blob_file->GetLinkedSstFiles().empty()) {
    return false;
  }

  ROCKS_LOG_INFO(db_options_.info_log,
                 "Blob file %" PRIu64 " is no longer needed, marking obsolete",
                 blob_file->BlobFileNumber());

  ObsoleteBlobFile(blob_file, obsolete_seq, /* update_size */ true);
  return true;
}

template <class Functor>
void BlobDBImpl::MarkUnreferencedBlobFilesObsoleteImpl(Functor mark_if_needed) {
  assert(bdb_options_.enable_garbage_collection);

  // Iterate through all live immutable non-TTL blob files, and mark them
  // obsolete assuming no SST files or memtables rely on the blobs in them.
  // Note: we need to stop as soon as we find a blob file that has any
  // linked SSTs (or one potentially referenced by memtables).

587
588
  uint64_t obsoleted_files = 0;

589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
  auto it = live_imm_non_ttl_blob_files_.begin();
  while (it != live_imm_non_ttl_blob_files_.end()) {
    const auto& blob_file = it->second;
    assert(blob_file);
    assert(blob_file->BlobFileNumber() == it->first);
    assert(!blob_file->HasTTL());
    assert(blob_file->Immutable());

    // Small optimization: Obsolete() does an atomic read, so we can do
    // this check without taking a lock on the blob file's mutex.
    if (blob_file->Obsolete()) {
      it = live_imm_non_ttl_blob_files_.erase(it);
      continue;
    }

    if (!mark_if_needed(blob_file)) {
      break;
606
    }
607
608

    it = live_imm_non_ttl_blob_files_.erase(it);
609
610
611
612
613
614
615
616
617

    ++obsoleted_files;
  }

  if (obsoleted_files > 0) {
    ROCKS_LOG_INFO(db_options_.info_log,
                   "%" PRIu64 " blob file(s) marked obsolete by GC",
                   obsoleted_files);
    RecordTick(statistics_, BLOB_DB_GC_NUM_FILES, obsoleted_files);
618
619
620
  }
}

621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
void BlobDBImpl::MarkUnreferencedBlobFilesObsolete() {
  const SequenceNumber obsolete_seq = GetLatestSequenceNumber();

  MarkUnreferencedBlobFilesObsoleteImpl(
      [=](const std::shared_ptr<BlobFile>& blob_file) {
        WriteLock file_lock(&blob_file->mutex_);
        return MarkBlobFileObsoleteIfNeeded(blob_file, obsolete_seq);
      });
}

void BlobDBImpl::MarkUnreferencedBlobFilesObsoleteDuringOpen() {
  MarkUnreferencedBlobFilesObsoleteImpl(
      [=](const std::shared_ptr<BlobFile>& blob_file) {
        return MarkBlobFileObsoleteIfNeeded(blob_file, /* obsolete_seq */ 0);
      });
}

Anirban Rahut's avatar
Anirban Rahut 已提交
638
639
640
641
642
643
void BlobDBImpl::CloseRandomAccessLocked(
    const std::shared_ptr<BlobFile>& bfile) {
  bfile->CloseRandomAccessLocked();
  open_file_count_--;
}

Yi Wu's avatar
Yi Wu 已提交
644
645
646
647
Status BlobDBImpl::GetBlobFileReader(
    const std::shared_ptr<BlobFile>& blob_file,
    std::shared_ptr<RandomAccessFileReader>* reader) {
  assert(reader != nullptr);
Anirban Rahut's avatar
Anirban Rahut 已提交
648
  bool fresh_open = false;
Yi Wu's avatar
Yi Wu 已提交
649
650
651
652
653
654
  Status s = blob_file->GetReader(env_, env_options_, reader, &fresh_open);
  if (s.ok() && fresh_open) {
    assert(*reader != nullptr);
    open_file_count_++;
  }
  return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
655
656
}

657
658
659
660
661
std::shared_ptr<BlobFile> BlobDBImpl::NewBlobFile(
    bool has_ttl, const ExpirationRange& expiration_range,
    const std::string& reason) {
  assert(has_ttl == (expiration_range.first || expiration_range.second));

Anirban Rahut's avatar
Anirban Rahut 已提交
662
  uint64_t file_num = next_file_number_++;
663
664
665
666
667
668
669

  const uint32_t column_family_id =
      static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
  auto blob_file = std::make_shared<BlobFile>(
      this, blob_dir_, file_num, db_options_.info_log.get(), column_family_id,
      bdb_options_.compression, has_ttl, expiration_range);

Yi Wu's avatar
Yi Wu 已提交
670
  ROCKS_LOG_DEBUG(db_options_.info_log, "New blob file created: %s reason='%s'",
671
                  blob_file->PathName().c_str(), reason.c_str());
Anirban Rahut's avatar
Anirban Rahut 已提交
672
  LogFlush(db_options_.info_log);
673
674

  return blob_file;
Anirban Rahut's avatar
Anirban Rahut 已提交
675
676
}

677
678
679
680
681
682
683
684
685
686
687
void BlobDBImpl::RegisterBlobFile(std::shared_ptr<BlobFile> blob_file) {
  const uint64_t blob_file_number = blob_file->BlobFileNumber();

  auto it = blob_files_.lower_bound(blob_file_number);
  assert(it == blob_files_.end() || it->first != blob_file_number);

  blob_files_.insert(it,
                     std::map<uint64_t, std::shared_ptr<BlobFile>>::value_type(
                         blob_file_number, std::move(blob_file)));
}

Anirban Rahut's avatar
Anirban Rahut 已提交
688
689
690
691
Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile) {
  std::string fpath(bfile->PathName());
  std::unique_ptr<WritableFile> wfile;

Yi Wu's avatar
Yi Wu 已提交
692
  Status s = env_->ReopenWritableFile(fpath, &wfile, env_options_);
Anirban Rahut's avatar
Anirban Rahut 已提交
693
  if (!s.ok()) {
Yi Wu's avatar
Yi Wu 已提交
694
695
696
697
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "Failed to open blob file for write: %s status: '%s'"
                    " exists: '%s'",
                    fpath.c_str(), s.ToString().c_str(),
Yi Wu's avatar
Yi Wu 已提交
698
                    env_->FileExists(fpath).ToString().c_str());
Anirban Rahut's avatar
Anirban Rahut 已提交
699
700
701
702
    return s;
  }

  std::unique_ptr<WritableFileWriter> fwriter;
703
704
  fwriter.reset(new WritableFileWriter(
      NewLegacyWritableFileWrapper(std::move(wfile)), fpath, env_options_));
Anirban Rahut's avatar
Anirban Rahut 已提交
705
706
707

  uint64_t boffset = bfile->GetFileSize();
  if (debug_level_ >= 2 && boffset) {
708
709
710
    ROCKS_LOG_DEBUG(db_options_.info_log,
                    "Open blob file: %s with offset: %" PRIu64, fpath.c_str(),
                    boffset);
Anirban Rahut's avatar
Anirban Rahut 已提交
711
712
713
  }

  Writer::ElemType et = Writer::kEtNone;
Yi Wu's avatar
Yi Wu 已提交
714
  if (bfile->file_size_ == BlobLogHeader::kSize) {
Anirban Rahut's avatar
Anirban Rahut 已提交
715
    et = Writer::kEtFileHdr;
Yi Wu's avatar
Yi Wu 已提交
716
  } else if (bfile->file_size_ > BlobLogHeader::kSize) {
717
718
    et = Writer::kEtRecord;
  } else if (bfile->file_size_) {
Yi Wu's avatar
Yi Wu 已提交
719
    ROCKS_LOG_WARN(db_options_.info_log,
720
721
                   "Open blob file: %s with wrong size: %" PRIu64,
                   fpath.c_str(), boffset);
Anirban Rahut's avatar
Anirban Rahut 已提交
722
723
724
725
    return Status::Corruption("Invalid blob file size");
  }

  bfile->log_writer_ = std::make_shared<Writer>(
Yi Wu's avatar
Yi Wu 已提交
726
727
      std::move(fwriter), env_, statistics_, bfile->file_number_,
      bdb_options_.bytes_per_sync, db_options_.use_fsync, boffset);
Anirban Rahut's avatar
Anirban Rahut 已提交
728
729
730
731
732
733
  bfile->log_writer_->last_elem_type_ = et;

  return s;
}

std::shared_ptr<BlobFile> BlobDBImpl::FindBlobFileLocked(
734
    uint64_t expiration) const {
Yi Wu's avatar
Yi Wu 已提交
735
736
737
  if (open_ttl_files_.empty()) {
    return nullptr;
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
738
739

  std::shared_ptr<BlobFile> tmp = std::make_shared<BlobFile>();
Yi Wu's avatar
Yi Wu 已提交
740
  tmp->SetHasTTL(true);
Yi Wu's avatar
Yi Wu 已提交
741
  tmp->expiration_range_ = std::make_pair(expiration, 0);
Yi Wu's avatar
Yi Wu 已提交
742
  tmp->file_number_ = std::numeric_limits<uint64_t>::max();
Anirban Rahut's avatar
Anirban Rahut 已提交
743

Yi Wu's avatar
Yi Wu 已提交
744
745
746
  auto citr = open_ttl_files_.equal_range(tmp);
  if (citr.first == open_ttl_files_.end()) {
    assert(citr.second == open_ttl_files_.end());
Anirban Rahut's avatar
Anirban Rahut 已提交
747

Yi Wu's avatar
Yi Wu 已提交
748
    std::shared_ptr<BlobFile> check = *(open_ttl_files_.rbegin());
Yi Wu's avatar
Yi Wu 已提交
749
    return (check->expiration_range_.second <= expiration) ? nullptr : check;
Anirban Rahut's avatar
Anirban Rahut 已提交
750
751
  }

Yi Wu's avatar
Yi Wu 已提交
752
753
754
  if (citr.first != citr.second) {
    return *(citr.first);
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
755
756

  auto finditr = citr.second;
Yi Wu's avatar
Yi Wu 已提交
757
758
759
  if (finditr != open_ttl_files_.begin()) {
    --finditr;
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
760

Yi Wu's avatar
Yi Wu 已提交
761
  bool b2 = (*finditr)->expiration_range_.second <= expiration;
Yi Wu's avatar
Yi Wu 已提交
762
  bool b1 = (*finditr)->expiration_range_.first > expiration;
Anirban Rahut's avatar
Anirban Rahut 已提交
763
764
765
766

  return (b1 || b2) ? nullptr : (*finditr);
}

Yi Wu's avatar
Yi Wu 已提交
767
768
769
770
771
772
773
774
775
776
777
778
779
Status BlobDBImpl::CheckOrCreateWriterLocked(
    const std::shared_ptr<BlobFile>& blob_file,
    std::shared_ptr<Writer>* writer) {
  assert(writer != nullptr);
  *writer = blob_file->GetWriter();
  if (*writer != nullptr) {
    return Status::OK();
  }
  Status s = CreateWriterLocked(blob_file);
  if (s.ok()) {
    *writer = blob_file->GetWriter();
  }
  return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
780
781
}

782
783
784
785
786
787
788
Status BlobDBImpl::CreateBlobFileAndWriter(
    bool has_ttl, const ExpirationRange& expiration_range,
    const std::string& reason, std::shared_ptr<BlobFile>* blob_file,
    std::shared_ptr<Writer>* writer) {
  assert(has_ttl == (expiration_range.first || expiration_range.second));
  assert(blob_file);
  assert(writer);
Anirban Rahut's avatar
Anirban Rahut 已提交
789

790
791
  *blob_file = NewBlobFile(has_ttl, expiration_range, reason);
  assert(*blob_file);
Anirban Rahut's avatar
Anirban Rahut 已提交
792
793

  // file not visible, hence no lock
794
  Status s = CheckOrCreateWriterLocked(*blob_file, writer);
Yi Wu's avatar
Yi Wu 已提交
795
  if (!s.ok()) {
Yi Wu's avatar
Yi Wu 已提交
796
    ROCKS_LOG_ERROR(db_options_.info_log,
797
                    "Failed to get writer for blob file: %s, error: %s",
Yi Wu's avatar
Yi Wu 已提交
798
799
                    (*blob_file)->PathName().c_str(), s.ToString().c_str());
    return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
800
801
  }

802
  assert(*writer);
Anirban Rahut's avatar
Anirban Rahut 已提交
803

804
  s = (*writer)->WriteHeader((*blob_file)->header_);
Anirban Rahut's avatar
Anirban Rahut 已提交
805
  if (!s.ok()) {
Yi Wu's avatar
Yi Wu 已提交
806
807
808
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "Failed to write header to new blob file: %s"
                    " status: '%s'",
Yi Wu's avatar
Yi Wu 已提交
809
810
                    (*blob_file)->PathName().c_str(), s.ToString().c_str());
    return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
811
812
  }

813
  (*blob_file)->SetFileSize(BlobLogHeader::kSize);
Yi Wu's avatar
Yi Wu 已提交
814
  total_blob_size_ += BlobLogHeader::kSize;
815

Yi Wu's avatar
Yi Wu 已提交
816
  return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
817
818
}

819
820
821
Status BlobDBImpl::SelectBlobFile(std::shared_ptr<BlobFile>* blob_file) {
  assert(blob_file);

Anirban Rahut's avatar
Anirban Rahut 已提交
822
823
824
  {
    ReadLock rl(&mutex_);

825
826
827
828
829
    if (open_non_ttl_file_) {
      assert(!open_non_ttl_file_->Immutable());
      *blob_file = open_non_ttl_file_;
      return Status::OK();
    }
Anirban Rahut's avatar
Anirban Rahut 已提交
830
831
  }

832
833
  // Check again
  WriteLock wl(&mutex_);
Anirban Rahut's avatar
Anirban Rahut 已提交
834

835
836
837
838
839
  if (open_non_ttl_file_) {
    assert(!open_non_ttl_file_->Immutable());
    *blob_file = open_non_ttl_file_;
    return Status::OK();
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
840

Yi Wu's avatar
Yi Wu 已提交
841
  std::shared_ptr<Writer> writer;
842
843
844
  const Status s = CreateBlobFileAndWriter(
      /* has_ttl */ false, ExpirationRange(),
      /* reason */ "SelectBlobFile", blob_file, &writer);
Yi Wu's avatar
Yi Wu 已提交
845
846
  if (!s.ok()) {
    return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
847
848
  }

849
  RegisterBlobFile(*blob_file);
850
  open_non_ttl_file_ = *blob_file;
Anirban Rahut's avatar
Anirban Rahut 已提交
851

852
853
  return s;
}
Anirban Rahut's avatar
Anirban Rahut 已提交
854

855
856
857
858
859
860
861
862
863
864
865
Status BlobDBImpl::SelectBlobFileTTL(uint64_t expiration,
                                     std::shared_ptr<BlobFile>* blob_file) {
  assert(blob_file);
  assert(expiration != kNoExpiration);

  {
    ReadLock rl(&mutex_);

    *blob_file = FindBlobFileLocked(expiration);
    if (*blob_file != nullptr) {
      assert(!(*blob_file)->Immutable());
Yi Wu's avatar
Yi Wu 已提交
866
867
      return Status::OK();
    }
Anirban Rahut's avatar
Anirban Rahut 已提交
868
869
  }

870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
  // Check again
  WriteLock wl(&mutex_);

  *blob_file = FindBlobFileLocked(expiration);
  if (*blob_file != nullptr) {
    assert(!(*blob_file)->Immutable());
    return Status::OK();
  }

  const uint64_t exp_low =
      (expiration / bdb_options_.ttl_range_secs) * bdb_options_.ttl_range_secs;
  const uint64_t exp_high = exp_low + bdb_options_.ttl_range_secs;
  const ExpirationRange expiration_range(exp_low, exp_high);

  std::ostringstream oss;
  oss << "SelectBlobFileTTL range: [" << exp_low << ',' << exp_high << ')';

  std::shared_ptr<Writer> writer;
  const Status s =
      CreateBlobFileAndWriter(/* has_ttl */ true, expiration_range,
                              /* reason */ oss.str(), blob_file, &writer);
Anirban Rahut's avatar
Anirban Rahut 已提交
891
  if (!s.ok()) {
Yi Wu's avatar
Yi Wu 已提交
892
    return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
893
894
  }

895
  RegisterBlobFile(*blob_file);
Yi Wu's avatar
Yi Wu 已提交
896
  open_ttl_files_.insert(*blob_file);
Anirban Rahut's avatar
Anirban Rahut 已提交
897

Yi Wu's avatar
Yi Wu 已提交
898
  return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
899
900
}

Yi Wu's avatar
Yi Wu 已提交
901
902
903
904
905
906
class BlobDBImpl::BlobInserter : public WriteBatch::Handler {
 private:
  const WriteOptions& options_;
  BlobDBImpl* blob_db_impl_;
  uint32_t default_cf_id_;
  WriteBatch batch_;
Anirban Rahut's avatar
Anirban Rahut 已提交
907

Yi Wu's avatar
Yi Wu 已提交
908
909
 public:
  BlobInserter(const WriteOptions& options, BlobDBImpl* blob_db_impl,
910
               uint32_t default_cf_id)
Yi Wu's avatar
Yi Wu 已提交
911
912
      : options_(options),
        blob_db_impl_(blob_db_impl),
913
        default_cf_id_(default_cf_id) {}
Yi Wu's avatar
Yi Wu 已提交
914
915
916

  WriteBatch* batch() { return &batch_; }

917
918
  Status PutCF(uint32_t column_family_id, const Slice& key,
               const Slice& value) override {
Yi Wu's avatar
Yi Wu 已提交
919
920
921
    if (column_family_id != default_cf_id_) {
      return Status::NotSupported(
          "Blob DB doesn't support non-default column family.");
Anirban Rahut's avatar
Anirban Rahut 已提交
922
    }
923
924
    Status s = blob_db_impl_->PutBlobValue(options_, key, value, kNoExpiration,
                                           &batch_);
Yi Wu's avatar
Yi Wu 已提交
925
926
    return s;
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
927

928
  Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
Yi Wu's avatar
Yi Wu 已提交
929
930
931
    if (column_family_id != default_cf_id_) {
      return Status::NotSupported(
          "Blob DB doesn't support non-default column family.");
Anirban Rahut's avatar
Anirban Rahut 已提交
932
    }
Yi Wu's avatar
Yi Wu 已提交
933
934
935
    Status s = WriteBatchInternal::Delete(&batch_, column_family_id, key);
    return s;
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
936

Yi Wu's avatar
Yi Wu 已提交
937
938
939
940
941
  virtual Status DeleteRange(uint32_t column_family_id, const Slice& begin_key,
                             const Slice& end_key) {
    if (column_family_id != default_cf_id_) {
      return Status::NotSupported(
          "Blob DB doesn't support non-default column family.");
942
    }
Yi Wu's avatar
Yi Wu 已提交
943
944
945
946
    Status s = WriteBatchInternal::DeleteRange(&batch_, column_family_id,
                                               begin_key, end_key);
    return s;
  }
947

948
949
  Status SingleDeleteCF(uint32_t /*column_family_id*/,
                        const Slice& /*key*/) override {
Yi Wu's avatar
Yi Wu 已提交
950
951
    return Status::NotSupported("Not supported operation in blob db.");
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
952

953
954
  Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
                 const Slice& /*value*/) override {
Yi Wu's avatar
Yi Wu 已提交
955
956
    return Status::NotSupported("Not supported operation in blob db.");
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
957

958
  void LogData(const Slice& blob) override { batch_.PutLogData(blob); }
Yi Wu's avatar
Yi Wu 已提交
959
};
Anirban Rahut's avatar
Anirban Rahut 已提交
960

Yi Wu's avatar
Yi Wu 已提交
961
Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
Yi Wu's avatar
Yi Wu 已提交
962
963
  StopWatch write_sw(env_, statistics_, BLOB_DB_WRITE_MICROS);
  RecordTick(statistics_, BLOB_DB_NUM_WRITE);
Yi Wu's avatar
Yi Wu 已提交
964
965
  uint32_t default_cf_id =
      reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
966
  Status s;
967
  BlobInserter blob_inserter(options, this, default_cf_id);
968
969
970
971
972
973
974
  {
    // Release write_mutex_ before DB write to avoid race condition with
    // flush begin listener, which also require write_mutex_ to sync
    // blob files.
    MutexLock l(&write_mutex_);
    s = updates->Iterate(&blob_inserter);
  }
975
976
  if (!s.ok()) {
    return s;
Anirban Rahut's avatar
Anirban Rahut 已提交
977
  }
978
  return db_->Write(options, blob_inserter.batch());
Anirban Rahut's avatar
Anirban Rahut 已提交
979
980
}

Yi Wu's avatar
Yi Wu 已提交
981
982
Status BlobDBImpl::Put(const WriteOptions& options, const Slice& key,
                       const Slice& value) {
983
  return PutUntil(options, key, value, kNoExpiration);
Yi Wu's avatar
Yi Wu 已提交
984
985
}

Anirban Rahut's avatar
Anirban Rahut 已提交
986
987
Status BlobDBImpl::PutWithTTL(const WriteOptions& options,
                              const Slice& key, const Slice& value,
988
989
                              uint64_t ttl) {
  uint64_t now = EpochNow();
Yi Wu's avatar
Yi Wu 已提交
990
991
  uint64_t expiration = kNoExpiration - now > ttl ? now + ttl : kNoExpiration;
  return PutUntil(options, key, value, expiration);
Yi Wu's avatar
Yi Wu 已提交
992
993
}

994
Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
Yi Wu's avatar
Yi Wu 已提交
995
                            const Slice& value, uint64_t expiration) {
Yi Wu's avatar
Yi Wu 已提交
996
997
  StopWatch write_sw(env_, statistics_, BLOB_DB_WRITE_MICROS);
  RecordTick(statistics_, BLOB_DB_NUM_PUT);
998
  Status s;
Yi Wu's avatar
Yi Wu 已提交
999
  WriteBatch batch;
1000
1001
1002
1003
1004
  {
    // Release write_mutex_ before DB write to avoid race condition with
    // flush begin listener, which also require write_mutex_ to sync
    // blob files.
    MutexLock l(&write_mutex_);
1005
    s = PutBlobValue(options, key, value, expiration, &batch);
1006
  }
Yi Wu's avatar
Yi Wu 已提交
1007
1008
  if (s.ok()) {
    s = db_->Write(options, &batch);
Anirban Rahut's avatar
Anirban Rahut 已提交
1009
  }
Yi Wu's avatar
Yi Wu 已提交
1010
1011
  return s;
}
Anirban Rahut's avatar
Anirban Rahut 已提交
1012

Andrew Kryczka's avatar
Andrew Kryczka 已提交
1013
1014
1015
Status BlobDBImpl::PutBlobValue(const WriteOptions& /*options*/,
                                const Slice& key, const Slice& value,
                                uint64_t expiration, WriteBatch* batch) {
Yi Wu's avatar
Yi Wu 已提交
1016
  write_mutex_.AssertHeld();
Yi Wu's avatar
Yi Wu 已提交
1017
1018
  Status s;
  std::string index_entry;
1019
1020
  uint32_t column_family_id =
      reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
Yi Wu's avatar
Yi Wu 已提交
1021
1022
1023
1024
  if (value.size() < bdb_options_.min_blob_size) {
    if (expiration == kNoExpiration) {
      // Put as normal value
      s = batch->Put(key, value);
Yi Wu's avatar
Yi Wu 已提交
1025
      RecordTick(statistics_, BLOB_DB_WRITE_INLINED);
Yi Wu's avatar
Yi Wu 已提交
1026
1027
1028
1029
1030
    } else {
      // Inlined with TTL
      BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value);
      s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
                                           index_entry);
Yi Wu's avatar
Yi Wu 已提交
1031
      RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL);
Yi Wu's avatar
Yi Wu 已提交
1032
1033
1034
1035
    }
  } else {
    std::string compression_output;
    Slice value_compressed = GetCompressedSlice(value, &compression_output);
Anirban Rahut's avatar
Anirban Rahut 已提交
1036

Yi Wu's avatar
Yi Wu 已提交
1037
    std::string headerbuf;
Yi Wu's avatar
Yi Wu 已提交
1038
    Writer::ConstructBlobHeader(&headerbuf, key, value_compressed, expiration);
1039

Yi Wu's avatar
Yi Wu 已提交
1040
1041
1042
1043
1044
1045
1046
1047
1048
    // Check DB size limit before selecting blob file to
    // Since CheckSizeAndEvictBlobFiles() can close blob files, it needs to be
    // done before calling SelectBlobFile().
    s = CheckSizeAndEvictBlobFiles(headerbuf.size() + key.size() +
                                   value_compressed.size());
    if (!s.ok()) {
      return s;
    }

Yi Wu's avatar
Yi Wu 已提交
1049
1050
1051
    std::shared_ptr<BlobFile> blob_file;
    if (expiration != kNoExpiration) {
      s = SelectBlobFileTTL(expiration, &blob_file);
Yi Wu's avatar
Yi Wu 已提交
1052
    } else {
Yi Wu's avatar
Yi Wu 已提交
1053
1054
1055
1056
      s = SelectBlobFile(&blob_file);
    }
    if (s.ok()) {
      assert(blob_file != nullptr);
1057
      assert(blob_file->GetCompressionType() == bdb_options_.compression);
Yi Wu's avatar
Yi Wu 已提交
1058
1059
      s = AppendBlob(blob_file, headerbuf, key, value_compressed, expiration,
                     &index_entry);
Yi Wu's avatar
Yi Wu 已提交
1060
    }
Yi Wu's avatar
Yi Wu 已提交
1061
1062
    if (s.ok()) {
      if (expiration != kNoExpiration) {
Levi Tamasi's avatar
Levi Tamasi 已提交
1063
        WriteLock file_lock(&blob_file->mutex_);
Yi Wu's avatar
Yi Wu 已提交
1064
        blob_file->ExtendExpirationRange(expiration);
Yi Wu's avatar
Yi Wu 已提交
1065
      }
Yi Wu's avatar
Yi Wu 已提交
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
      s = CloseBlobFileIfNeeded(blob_file);
    }
    if (s.ok()) {
      s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
                                           index_entry);
    }
    if (s.ok()) {
      if (expiration == kNoExpiration) {
        RecordTick(statistics_, BLOB_DB_WRITE_BLOB);
      } else {
        RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL);
Yi Wu's avatar
Yi Wu 已提交
1077
1078
      }
    } else {
1079
1080
1081
1082
1083
1084
      ROCKS_LOG_ERROR(
          db_options_.info_log,
          "Failed to append blob to FILE: %s: KEY: %s VALSZ: %" ROCKSDB_PRIszt
          " status: '%s' blob_file: '%s'",
          blob_file->PathName().c_str(), key.ToString().c_str(), value.size(),
          s.ToString().c_str(), blob_file->DumpState().c_str());
Yi Wu's avatar
Yi Wu 已提交
1085
    }
Yi Wu's avatar
Yi Wu 已提交
1086
  }
Anirban Rahut's avatar
Anirban Rahut 已提交
1087

Yi Wu's avatar
Yi Wu 已提交
1088
1089
  RecordTick(statistics_, BLOB_DB_NUM_KEYS_WRITTEN);
  RecordTick(statistics_, BLOB_DB_BYTES_WRITTEN, key.size() + value.size());
Siying Dong's avatar
Siying Dong 已提交
1090
1091
  RecordInHistogram(statistics_, BLOB_DB_KEY_SIZE, key.size());
  RecordInHistogram(statistics_, BLOB_DB_VALUE_SIZE, value.size());
Yi Wu's avatar
Yi Wu 已提交
1092

Anirban Rahut's avatar
Anirban Rahut 已提交
1093
1094
1095
  return s;
}

Yi Wu's avatar
Yi Wu 已提交
1096
1097
1098
1099
1100
Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
                                     std::string* compression_output) const {
  if (bdb_options_.compression == kNoCompression) {
    return raw;
  }
Yi Wu's avatar
Yi Wu 已提交
1101
  StopWatch compression_sw(env_, statistics_, BLOB_DB_COMPRESSION_MICROS);
1102
1103
1104
  CompressionType type = bdb_options_.compression;
  CompressionOptions opts;
  CompressionContext context(type);
1105
1106
1107
1108
  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type,
                       0 /* sample_for_compression */);
  CompressBlock(raw, info, &type, kBlockBasedTableVersionFormat, false,
                compression_output, nullptr, nullptr);
Yi Wu's avatar
Yi Wu 已提交
1109
1110
1111
  return *compression_output;
}

1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
Status BlobDBImpl::CompactFiles(
    const CompactionOptions& compact_options,
    const std::vector<std::string>& input_file_names, const int output_level,
    const int output_path_id, std::vector<std::string>* const output_file_names,
    CompactionJobInfo* compaction_job_info) {
  // Note: we need CompactionJobInfo to be able to track updates to the
  // blob file <-> SST mappings, so we provide one if the user hasn't,
  // assuming that GC is enabled.
  CompactionJobInfo info{};
  if (bdb_options_.enable_garbage_collection && !compaction_job_info) {
    compaction_job_info = &info;
  }

  const Status s =
      db_->CompactFiles(compact_options, input_file_names, output_level,
                        output_path_id, output_file_names, compaction_job_info);
  if (!s.ok()) {
    return s;
  }

  if (bdb_options_.enable_garbage_collection) {
    assert(compaction_job_info);
    ProcessCompactionJobInfo(*compaction_job_info);
  }

  return s;
}

1140
1141
1142
void BlobDBImpl::GetCompactionContextCommon(
    BlobCompactionContext* context) const {
  assert(context);
Yi Wu's avatar
Yi Wu 已提交
1143
1144
1145
1146
1147

  context->next_file_number = next_file_number_.load();
  context->current_blob_files.clear();
  for (auto& p : blob_files_) {
    context->current_blob_files.insert(p.first);
1148
  }
Yi Wu's avatar
Yi Wu 已提交
1149
1150
  context->fifo_eviction_seq = fifo_eviction_seq_;
  context->evict_expiration_up_to = evict_expiration_up_to_;
1151
1152
}

1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context) {
  assert(context);

  ReadLock l(&mutex_);
  GetCompactionContextCommon(context);
}

void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context,
                                      BlobCompactionContextGC* context_gc) {
  assert(context);
  assert(context_gc);

  ReadLock l(&mutex_);
  GetCompactionContextCommon(context);

  context_gc->blob_db_impl = this;

  if (!live_imm_non_ttl_blob_files_.empty()) {
    auto it = live_imm_non_ttl_blob_files_.begin();
    std::advance(it, bdb_options_.garbage_collection_cutoff *
                         live_imm_non_ttl_blob_files_.size());
    context_gc->cutoff_file_number = it != live_imm_non_ttl_blob_files_.end()
                                         ? it->first
                                         : std::numeric_limits<uint64_t>::max();
  }
}

Yi Wu's avatar
Yi Wu 已提交
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
void BlobDBImpl::UpdateLiveSSTSize() {
  uint64_t live_sst_size = 0;
  bool ok = GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
  if (ok) {
    live_sst_size_.store(live_sst_size);
    ROCKS_LOG_INFO(db_options_.info_log,
                   "Updated total SST file size: %" PRIu64 " bytes.",
                   live_sst_size);
  } else {
    ROCKS_LOG_ERROR(
        db_options_.info_log,
        "Failed to update total SST file size after flush or compaction.");
1192
  }
Yi Wu's avatar
Yi Wu 已提交
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
  {
    // Trigger FIFO eviction if needed.
    MutexLock l(&write_mutex_);
    Status s = CheckSizeAndEvictBlobFiles(0, true /*force*/);
    if (s.IsNoSpace()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "DB grow out-of-space after SST size updated. Current live"
                     " SST size: %" PRIu64