Skip to content

Commit 6facd76

Browse files
author
Hernan Gelaf-Romer
committed
HBASE-29744: Data loss scenario for WAL files belonging to RS added between backups
1 parent 9ae1f08 commit 6facd76

File tree

2 files changed

+146
-49
lines changed

2 files changed

+146
-49
lines changed

hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/master/BackupLogCleaner.java

Lines changed: 31 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@
1818
package org.apache.hadoop.hbase.backup.master;
1919

2020
import java.io.IOException;
21+
import java.time.Duration;
2122
import java.util.ArrayList;
2223
import java.util.Collections;
2324
import java.util.HashMap;
2425
import java.util.List;
2526
import java.util.Map;
27+
import java.util.Set;
2628
import java.util.stream.Collectors;
2729
import org.apache.hadoop.conf.Configuration;
2830
import org.apache.hadoop.fs.FileStatus;
@@ -32,7 +34,7 @@
3234
import org.apache.hadoop.hbase.backup.BackupInfo;
3335
import org.apache.hadoop.hbase.backup.BackupRestoreConstants;
3436
import org.apache.hadoop.hbase.backup.impl.BackupManager;
35-
import org.apache.hadoop.hbase.backup.util.BackupUtils;
37+
import org.apache.hadoop.hbase.backup.impl.BackupSystemTable;
3638
import org.apache.hadoop.hbase.client.Connection;
3739
import org.apache.hadoop.hbase.client.ConnectionFactory;
3840
import org.apache.hadoop.hbase.master.HMaster;
@@ -55,6 +57,10 @@
5557
*/
5658
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
5759
public class BackupLogCleaner extends BaseLogCleanerDelegate {
60+
// Adds a buffer to give some allowance when cleaning up WAL files
61+
public static final String TS_BUFFER_KEY = "hbase.backup.log.cleaner.timestamp.buffer.ms";
62+
private static final long TS_BUFFER_DEFAULT = Duration.ofHours(1).toMillis();
63+
5864
private static final Logger LOG = LoggerFactory.getLogger(BackupLogCleaner.class);
5965

6066
private boolean stopped = false;
@@ -153,19 +159,34 @@ public Iterable<FileStatus> getDeletableFiles(Iterable<FileStatus> files) {
153159
return files;
154160
}
155161

156-
Map<Address, Long> serverToPreservationBoundaryTs;
162+
long oldestStartCode = Long.MAX_VALUE;
157163
try {
158-
try (BackupManager backupManager = new BackupManager(conn, getConf())) {
159-
serverToPreservationBoundaryTs =
160-
serverToPreservationBoundaryTs(backupManager.getBackupHistory(true));
164+
try (BackupSystemTable sysTable = new BackupSystemTable(conn)) {
165+
Set<String> roots = sysTable.getBackupHistory(true).stream()
166+
.map(BackupInfo::getBackupRootDir).collect(Collectors.toSet());
167+
if (roots.isEmpty()) {
168+
LOG.info("No backups found, can delete all files");
169+
return files;
170+
}
171+
172+
for (String root : roots) {
173+
long startCode = Long.parseLong(sysTable.readBackupStartCode(root));
174+
if (startCode < oldestStartCode) {
175+
oldestStartCode = startCode;
176+
}
177+
}
161178
}
162179
} catch (IOException ex) {
163180
LOG.error("Failed to analyse backup history with exception: {}. Retaining all logs",
164181
ex.getMessage(), ex);
165182
return Collections.emptyList();
166183
}
184+
185+
oldestStartCode -= getConf().getLong(TS_BUFFER_KEY, TS_BUFFER_DEFAULT);
186+
LOG.info("Allowing file deletes for any wal file older than {}", oldestStartCode);
187+
167188
for (FileStatus file : files) {
168-
if (canDeleteFile(serverToPreservationBoundaryTs, file.getPath())) {
189+
if (canDeleteFile(oldestStartCode, file.getPath())) {
169190
filteredFiles.add(file);
170191
}
171192
}
@@ -200,44 +221,16 @@ public boolean isStopped() {
200221
return this.stopped;
201222
}
202223

203-
protected static boolean canDeleteFile(Map<Address, Long> addressToBoundaryTs, Path path) {
224+
protected static boolean canDeleteFile(long oldestStartCode, Path path) {
204225
if (isHMasterWAL(path)) {
205226
return true;
206227
}
207228

208229
try {
209-
String hostname = BackupUtils.parseHostNameFromLogFile(path);
210-
if (hostname == null) {
211-
LOG.warn(
212-
"Cannot parse hostname from RegionServer WAL file: {}. Ignoring cleanup of this log",
213-
path);
214-
return false;
215-
}
216-
Address walServerAddress = Address.fromString(hostname);
217230
long walTimestamp = AbstractFSWALProvider.getTimestamp(path.getName());
218-
219-
if (!addressToBoundaryTs.containsKey(walServerAddress)) {
220-
if (LOG.isDebugEnabled()) {
221-
LOG.debug("No cleanup WAL time-boundary found for server: {}. Ok to delete file: {}",
222-
walServerAddress.getHostName(), path);
223-
}
224-
return true;
225-
}
226-
227-
Long backupBoundary = addressToBoundaryTs.get(walServerAddress);
228-
if (backupBoundary >= walTimestamp) {
229-
if (LOG.isDebugEnabled()) {
230-
LOG.debug(
231-
"WAL cleanup time-boundary found for server {}: {}. Ok to delete older file: {}",
232-
walServerAddress.getHostName(), backupBoundary, path);
233-
}
231+
if (walTimestamp <= oldestStartCode) {
234232
return true;
235233
}
236-
237-
if (LOG.isDebugEnabled()) {
238-
LOG.debug("WAL cleanup time-boundary found for server {}: {}. Keeping younger file: {}",
239-
walServerAddress.getHostName(), backupBoundary, path);
240-
}
241234
} catch (Exception ex) {
242235
LOG.warn("Error occurred while filtering file: {}. Ignoring cleanup of this log", path, ex);
243236
return false;
@@ -248,6 +241,7 @@ protected static boolean canDeleteFile(Map<Address, Long> addressToBoundaryTs, P
248241
private static boolean isHMasterWAL(Path path) {
249242
String fn = path.getName();
250243
return fn.startsWith(WALProcedureStore.LOG_PREFIX)
251-
|| fn.endsWith(MasterRegionFactory.ARCHIVED_WAL_SUFFIX);
244+
|| fn.endsWith(MasterRegionFactory.ARCHIVED_WAL_SUFFIX)
245+
|| path.toString().contains(MasterRegionFactory.MASTER_STORE_DIR);
252246
}
253247
}

hbase-backup/src/test/java/org/apache/hadoop/hbase/backup/master/TestBackupLogCleaner.java

Lines changed: 115 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,23 +21,29 @@
2121
import static org.junit.Assert.assertFalse;
2222
import static org.junit.Assert.assertTrue;
2323

24+
import com.google.common.collect.ImmutableSet;
25+
import java.io.IOException;
26+
import java.util.ArrayList;
2427
import java.util.Collection;
25-
import java.util.Collections;
2628
import java.util.LinkedHashSet;
2729
import java.util.List;
2830
import java.util.Map;
2931
import java.util.Set;
3032
import org.apache.hadoop.fs.FileStatus;
3133
import org.apache.hadoop.fs.Path;
3234
import org.apache.hadoop.hbase.HBaseClassTestRule;
35+
import org.apache.hadoop.hbase.HRegionLocation;
36+
import org.apache.hadoop.hbase.ServerName;
3337
import org.apache.hadoop.hbase.TableName;
3438
import org.apache.hadoop.hbase.backup.BackupType;
3539
import org.apache.hadoop.hbase.backup.TestBackupBase;
3640
import org.apache.hadoop.hbase.backup.impl.BackupSystemTable;
3741
import org.apache.hadoop.hbase.client.Connection;
3842
import org.apache.hadoop.hbase.client.Put;
43+
import org.apache.hadoop.hbase.client.RegionInfo;
3944
import org.apache.hadoop.hbase.client.Table;
4045
import org.apache.hadoop.hbase.master.HMaster;
46+
import org.apache.hadoop.hbase.regionserver.HRegionServer;
4147
import org.apache.hadoop.hbase.testclassification.LargeTests;
4248
import org.apache.hadoop.hbase.util.Bytes;
4349
import org.junit.ClassRule;
@@ -66,6 +72,7 @@ public void testBackupLogCleaner() throws Exception {
6672
List<TableName> tableSetFull = List.of(table1, table2, table3, table4);
6773
List<TableName> tableSet14 = List.of(table1, table4);
6874
List<TableName> tableSet23 = List.of(table2, table3);
75+
TEST_UTIL.getConfiguration().setLong(BackupLogCleaner.TS_BUFFER_KEY, 0);
6976

7077
try (BackupSystemTable systemTable = new BackupSystemTable(TEST_UTIL.getConnection())) {
7178
// Verify that we have no backup sessions yet
@@ -193,35 +200,131 @@ public void testBackupLogCleaner() throws Exception {
193200
// Taking the minimum timestamp (= 2), this means all WALs preceding B3 can be deleted.
194201
deletable = cleaner.getDeletableFiles(walFilesAfterB5);
195202
assertEquals(toSet(walFilesAfterB2), toSet(deletable));
203+
} finally {
204+
TEST_UTIL.truncateTable(BackupSystemTable.getTableName(TEST_UTIL.getConfiguration())).close();
196205
}
197206
}
198207

199-
private Set<FileStatus> mergeAsSet(Collection<FileStatus> toCopy, Collection<FileStatus> toAdd) {
200-
Set<FileStatus> result = new LinkedHashSet<>(toCopy);
201-
result.addAll(toAdd);
202-
return result;
208+
@Test
209+
public void testDoesNotDeleteWALsFromNewServers() throws Exception {
210+
Path backupRoot1 = new Path(BACKUP_ROOT_DIR, "backup1");
211+
List<TableName> tableSetFull = List.of(table1, table2, table3, table4);
212+
213+
try (BackupSystemTable systemTable = new BackupSystemTable(TEST_UTIL.getConnection())) {
214+
LOG.info("Creating initial backup B1");
215+
String backupIdB1 = backupTables(BackupType.FULL, tableSetFull, backupRoot1.toString());
216+
assertTrue(checkSucceeded(backupIdB1));
217+
218+
List<FileStatus> walsAfterB1 = getListOfWALFiles(TEST_UTIL.getConfiguration());
219+
LOG.info("WALs after B1: {}", walsAfterB1.size());
220+
221+
String startCodeStr = systemTable.readBackupStartCode(backupRoot1.toString());
222+
long b1StartCode = Long.parseLong(startCodeStr);
223+
LOG.info("B1 startCode: {}", b1StartCode);
224+
225+
// Add a new RegionServer to the cluster
226+
LOG.info("Adding new RegionServer to cluster");
227+
HRegionServer newRS = TEST_UTIL.getMiniHBaseCluster().startRegionServer().getRegionServer();
228+
ServerName newServerName = newRS.getServerName();
229+
LOG.info("New RegionServer started: {}", newServerName);
230+
231+
// Move a region to the new server to ensure it creates a WAL
232+
List<RegionInfo> regions = TEST_UTIL.getAdmin().getRegions(table1);
233+
RegionInfo regionToMove = regions.get(0);
234+
235+
LOG.info("Moving region {} to new server {}", regionToMove.getEncodedName(), newServerName);
236+
TEST_UTIL.getAdmin().move(regionToMove.getEncodedNameAsBytes(), newServerName);
237+
238+
TEST_UTIL.waitFor(30000, () -> {
239+
try {
240+
HRegionLocation location = TEST_UTIL.getConnection().getRegionLocator(table1)
241+
.getRegionLocation(regionToMove.getStartKey());
242+
return location.getServerName().equals(newServerName);
243+
} catch (IOException e) {
244+
return false;
245+
}
246+
});
247+
248+
// Write some data to trigger WAL creation on the new server
249+
try (Table t1 = TEST_UTIL.getConnection().getTable(table1)) {
250+
for (int i = 0; i < 100; i++) {
251+
Put p = new Put(Bytes.toBytes("newserver-row-" + i));
252+
p.addColumn(famName, qualName, Bytes.toBytes("val" + i));
253+
t1.put(p);
254+
}
255+
}
256+
TEST_UTIL.getAdmin().flushRegion(regionToMove.getEncodedNameAsBytes());
257+
258+
List<FileStatus> walsAfterNewServer = getListOfWALFiles(TEST_UTIL.getConfiguration());
259+
LOG.info("WALs after adding new server: {}", walsAfterNewServer.size());
260+
assertTrue("Should have more WALs after new server",
261+
walsAfterNewServer.size() > walsAfterB1.size());
262+
263+
List<FileStatus> newServerWALs = new ArrayList<>(walsAfterNewServer);
264+
newServerWALs.removeAll(walsAfterB1);
265+
assertFalse("Should have WALs from new server", newServerWALs.isEmpty());
266+
267+
BackupLogCleaner cleaner = new BackupLogCleaner();
268+
cleaner.setConf(TEST_UTIL.getConfiguration());
269+
cleaner.init(Map.of(HMaster.MASTER, TEST_UTIL.getHBaseCluster().getMaster()));
270+
271+
Set<FileStatus> deletable =
272+
ImmutableSet.copyOf(cleaner.getDeletableFiles(walsAfterNewServer));
273+
for (FileStatus newWAL : newServerWALs) {
274+
assertFalse("WAL from new server should NOT be deletable: " + newWAL.getPath(),
275+
deletable.contains(newWAL));
276+
}
277+
} finally {
278+
TEST_UTIL.truncateTable(BackupSystemTable.getTableName(TEST_UTIL.getConfiguration())).close();
279+
}
203280
}
204281

205-
private <T> Set<T> toSet(Iterable<T> iterable) {
206-
Set<T> result = new LinkedHashSet<>();
207-
iterable.forEach(result::add);
208-
return result;
282+
@Test
283+
public void testCanDeleteFileWithNewServerWALs() {
284+
long backupStartCode = 1000000L;
285+
286+
// Old WAL from before the backup
287+
Path oldWAL = new Path("/hbase/oldWALs/server1%2C60020%2C12345.500000");
288+
assertTrue("WAL older than backup should be deletable",
289+
BackupLogCleaner.canDeleteFile(backupStartCode, oldWAL));
290+
291+
// WAL from exactly at the backup boundary
292+
Path boundaryWAL = new Path("/hbase/oldWALs/server1%2C60020%2C12345.1000000");
293+
assertTrue("WAL at boundary should be deletable",
294+
BackupLogCleaner.canDeleteFile(backupStartCode, boundaryWAL));
295+
296+
// WAL from a server that joined AFTER the backup
297+
Path newServerWAL = new Path("/hbase/oldWALs/newserver%2C60020%2C99999.1500000");
298+
assertFalse("WAL from new server (after backup) should NOT be deletable",
299+
BackupLogCleaner.canDeleteFile(backupStartCode, newServerWAL));
209300
}
210301

211302
@Test
212303
public void testCleansUpHMasterWal() {
213304
Path path = new Path("/hbase/MasterData/WALs/hmaster,60000,1718808578163");
214-
assertTrue(BackupLogCleaner.canDeleteFile(Collections.emptyMap(), path));
305+
assertTrue(BackupLogCleaner.canDeleteFile(Long.MIN_VALUE, path));
215306
}
216307

217308
@Test
218309
public void testCleansUpArchivedHMasterWal() {
219310
Path normalPath =
220311
new Path("/hbase/oldWALs/hmaster%2C60000%2C1716224062663.1716247552189$masterlocalwal$");
221-
assertTrue(BackupLogCleaner.canDeleteFile(Collections.emptyMap(), normalPath));
312+
assertTrue(BackupLogCleaner.canDeleteFile(Long.MIN_VALUE, normalPath));
222313

223314
Path masterPath = new Path(
224315
"/hbase/MasterData/oldWALs/hmaster%2C60000%2C1716224062663.1716247552189$masterlocalwal$");
225-
assertTrue(BackupLogCleaner.canDeleteFile(Collections.emptyMap(), masterPath));
316+
assertTrue(BackupLogCleaner.canDeleteFile(Long.MIN_VALUE, masterPath));
317+
}
318+
319+
private Set<FileStatus> mergeAsSet(Collection<FileStatus> toCopy, Collection<FileStatus> toAdd) {
320+
Set<FileStatus> result = new LinkedHashSet<>(toCopy);
321+
result.addAll(toAdd);
322+
return result;
323+
}
324+
325+
private <T> Set<T> toSet(Iterable<T> iterable) {
326+
Set<T> result = new LinkedHashSet<>();
327+
iterable.forEach(result::add);
328+
return result;
226329
}
227330
}

0 commit comments

Comments
 (0)