/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
 */

package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.CharsetNames;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The TarArchiveInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position the stream at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
 * @NotThreadSafe
 */
public class TarArchiveInputStream extends ArchiveInputStream {

    private static final int SMALL_BUFFER_SIZE = 256;

    private final byte[] SMALL_BUF = new byte[SMALL_BUFFER_SIZE];

    /** The size of the TAR header */
    private final int recordSize;

    /** The size of a block */
    private final int blockSize;

    /** True if the stream has hit EOF */
    private boolean hasHitEOF;

    /** Size of the current entry */
    private long entrySize;

    /** How far into the entry the stream is at */
    private long entryOffset;

    /** An input stream to read from */
    private final InputStream is;

    /** The meta-data about the current entry */
    private TarArchiveEntry currEntry;

    /** The encoding of the file */
    private final ZipEncoding zipEncoding;

    // the provided encoding (for unit tests)
    final String encoding;

    // the global PAX header
    private Map<String, String> globalPaxHeaders = new HashMap<String, String>();

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     */
    public TarArchiveInputStream(InputStream is) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarInputStream.
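     *
     * <p>A minimal usage sketch (the file name {@code archive.tar} is
     * illustrative only):</p>
     * <pre>{@code
     * TarArchiveInputStream in = new TarArchiveInputStream(
     *     new java.io.FileInputStream("archive.tar"), "UTF-8");
     * TarArchiveEntry entry;
     * while ((entry = in.getNextTarEntry()) != null) {
     *     // consume the entry's contents via in.read(...)
     * }
     * in.close();
     * }</pre>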
     * @param is the input stream to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, String encoding) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE,
             encoding);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize,
                                 String encoding) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
        this(is, blockSize, recordSize, null);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize,
                                 String encoding) {
        this.is = is;
        this.hasHitEOF = false;
        this.encoding = encoding;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordSize = recordSize;
        this.blockSize = blockSize;
    }

    /**
     * Closes this stream by closing the underlying input stream.
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        is.close();
    }

    /**
     * Get the record size being used by this stream.
     *
     * @return The record size.
     */
    public int getRecordSize() {
        return recordSize;
    }

    /**
     * Get the available data that can be read from the current
     * entry in the archive. This does not indicate how much data
     * is left in the entire archive, only in the current entry.
     * This value is determined from the entry's size header field
     * and the amount of data already read from the current entry.
     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
     * bytes are left in the current entry in the archive.
     *
     * @return The number of available bytes for the current entry.
     * @throws IOException never thrown; declared to match the signature
     */
    @Override
    public int available() throws IOException {
        if (isDirectory()) {
            return 0;
        }
        if (entrySize - entryOffset > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) (entrySize - entryOffset);
    }

    /**
     * Skips over and discards <code>n</code> bytes of data from this input
     * stream. The <code>skip</code> method may, for a variety of reasons, end
     * up skipping over some smaller number of bytes, possibly <code>0</code>.
     * This may result from any of a number of conditions; reaching end of file
     * or end of entry before <code>n</code> bytes have been skipped are only
     * two possibilities. The actual number of bytes skipped is returned. If
     * <code>n</code> is negative, no bytes are skipped.
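     *
     * <p>For example, the remainder of the current entry could be discarded
     * with a loop along these lines (a sketch only; {@code tarIn} stands for
     * an instance of this class):</p>
     * <pre>{@code
     * long remaining;
     * while ((remaining = tarIn.available()) > 0) {
     *     if (tarIn.skip(remaining) == 0) {
     *         break; // nothing skipped, avoid a busy loop
     *     }
     * }
     * }</pre>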
     *
     * @param n
     *            the number of bytes to be skipped.
     * @return the actual number of bytes skipped.
     * @exception IOException
     *                if some other I/O error occurs.
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n <= 0 || isDirectory()) {
            return 0;
        }

        final long available = entrySize - entryOffset;
        final long skipped = is.skip(Math.min(n, available));
        count(skipped);
        entryOffset += skipped;
        return skipped;
    }

    /**
     * Since we do not support marking just yet, we return false.
     *
     * @return False.
     */
    @Override
    public boolean markSupported() {
        return false;
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     *
     * @param markLimit The limit to mark.
     */
    @Override
    public void mark(int markLimit) {
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     */
    @Override
    public synchronized void reset() {
    }

    /**
     * Get the next entry in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, position the input stream at the header of the
     * next entry, read that header, and instantiate and return
     * a new TarArchiveEntry from the header bytes. If there are
     * no more entries in the archive, null will be returned to
     * indicate that the end of the archive has been reached.
     *
     * @return The next TarArchiveEntry in the archive, or null.
     * @throws IOException on error
     */
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (hasHitEOF) {
            return null;
        }

        if (currEntry != null) {
            /* Skip will only go to the end of the current entry */
            IOUtils.skip(this, Long.MAX_VALUE);

            /* skip to the end of the last record */
            skipRecordPadding();
        }

        byte[] headerBuf = getRecord();

        if (headerBuf == null) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(headerBuf, zipEncoding);
        } catch (IllegalArgumentException e) {
            IOException ioe = new IOException("Error detected parsing the header");
            ioe.initCause(e);
            throw ioe;
        }

        entryOffset = 0;
        entrySize = currEntry.getSize();

        if (currEntry.isGNULongLinkEntry()) {
            byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by
                // entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by
                // entry
                return null;
            }
            currEntry.setName(zipEncoding.decode(longNameData));
        }

        if (currEntry.isGlobalPaxHeader()){ // Process Global Pax headers
            readGlobalPaxHeaders();
        }

        if (currEntry.isPaxHeader()){ // Process Pax headers
            paxHeaders();
        } else if (!globalPaxHeaders.isEmpty()) {
            applyPaxHeadersToCurrentEntry(globalPaxHeaders);
        }

        if (currEntry.isOldGNUSparse()){ // Process sparse files
            readOldGNUSparse();
        }

        // If the size of the next element in the archive has changed
        // due to a new size being reported in the posix header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();

        return currEntry;
    }

    /**
     * The last record of an entry should be written at the full record size,
     * so skip any additional space used to fill the record after the entry.
     */
    private void skipRecordPadding() throws IOException {
        if (!isDirectory() && this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
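            // e.g. with the default record size of 512 bytes, a 10 byte
            // entry still occupies a full record; the 502 trailing bytes
            // are padding and must be skipped before the next header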
            long numRecords = (this.entrySize / this.recordSize) + 1;
            long padding = (numRecords * this.recordSize) - this.entrySize;
            long skipped = IOUtils.skip(is, padding);
            count(skipped);
        }
    }

    /**
     * Get the next entry in this tar archive as longname data.
     *
     * @return The next entry in the archive as longname data, or null.
     * @throws IOException on error
     */
    protected byte[] getLongNameData() throws IOException {
        // read in the name
        ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length = 0;
        while ((length = read(SMALL_BUF)) >= 0) {
            longName.write(SMALL_BUF, 0, length);
        }
        getNextEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            byte[] l = new byte[length];
            System.arraycopy(longNameData, 0, l, 0, length);
            longNameData = l;
        }
        return longNameData;
    }

    /**
     * Get the next record in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry.
     *
     * <p>If there are no more entries in the archive, null will be
     * returned to indicate that the end of the archive has been
     * reached. At the same time the {@code hasHitEOF} marker will be
     * set to true.</p>
     *
     * @return The next header in the archive, or null.
     * @throws IOException on error
     */
    private byte[] getRecord() throws IOException {
        byte[] headerBuf = readRecord();
        hasHitEOF = isEOFRecord(headerBuf);
        if (hasHitEOF && headerBuf != null) {
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    /**
     * Determine if an archive record indicates End of Archive. End of
     * archive is indicated by a record that consists entirely of null bytes.
     *
     * @param record The record data to check.
     * @return true if the record data is an End of Archive
     */
    protected boolean isEOFRecord(byte[] record) {
        return record == null || ArchiveUtils.isArrayZero(record, recordSize);
    }

    /**
     * Read a record from the input stream and return the data.
     *
     * @return The record data or null if EOF has been hit.
     * @throws IOException on error
     */
    protected byte[] readRecord() throws IOException {

        byte[] record = new byte[recordSize];

        int readNow = IOUtils.readFully(is, record);
        count(readNow);
        if (readNow != recordSize) {
            return null;
        }

        return record;
    }

    private void readGlobalPaxHeaders() throws IOException {
        globalPaxHeaders = parsePaxHeaders(this);
        getNextEntry(); // Get the actual file entry
    }

    private void paxHeaders() throws IOException {
        Map<String, String> headers = parsePaxHeaders(this);
        getNextEntry(); // Get the actual file entry
        applyPaxHeadersToCurrentEntry(headers);
    }

    // NOTE, using a Map here makes it impossible to ever support GNU
    // sparse files using the PAX Format 0.0, see
    // https://www.gnu.org/software/tar/manual/html_section/tar_92.html#SEC188
    Map<String, String> parsePaxHeaders(InputStream i)
        throws IOException {
        Map<String, String> headers = new HashMap<String, String>(globalPaxHeaders);
        // Format is "length keyword=value\n"
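        // e.g. "30 mtime=1321711775.972059463\n" - the length field counts
        // every byte of the record, including the length digits themselves,
        // the separating space and the trailing newline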
        while(true){ // get length
            int ch;
            int len = 0;
            int read = 0;
            while((ch = i.read()) != -1) {
                read++;
                if (ch == ' '){ // End of length string
                    // Get keyword
                    ByteArrayOutputStream coll = new ByteArrayOutputStream();
                    while((ch = i.read()) != -1) {
                        read++;
                        if (ch == '='){ // end of keyword
                            String keyword = coll.toString(CharsetNames.UTF_8);
                            // Get rest of entry
                            final int restLen = len - read;
                            if (restLen == 1) { // only NL
                                headers.remove(keyword);
                            } else {
                                byte[] rest = new byte[restLen];
                                int got = IOUtils.readFully(i, rest);
                                if (got != restLen) {
                                    throw new IOException("Failed to read "
                                                          + "Paxheader. Expected "
                                                          + restLen
                                                          + " bytes, read "
                                                          + got);
                                }
                                // Drop trailing NL
                                String value = new String(rest, 0,
                                                          restLen - 1, CharsetNames.UTF_8);
                                headers.put(keyword, value);
                            }
                            break;
                        }
                        coll.write((byte) ch);
                    }
                    break; // Processed single header
                }
                len *= 10;
                len += ch - '0';
            }
            if (ch == -1){ // EOF
                break;
            }
        }
        return headers;
    }

    private void applyPaxHeadersToCurrentEntry(Map<String, String> headers) {
        /*
         * The following headers are defined for Pax.
         * atime, ctime, charset: cannot use these without changing TarArchiveEntry fields
         * mtime
         * comment
         * gid, gname
         * linkpath
         * size
         * uid, uname
         * SCHILY.devminor, SCHILY.devmajor: don't have setters/getters for those
         *
         * GNU sparse files use additional members, we use
         * GNU.sparse.size to detect the 0.0 and 0.1 versions and
         * GNU.sparse.realsize for 1.0.
         *
         * star files use additional members of which we use
         * SCHILY.filetype in order to detect star sparse files.
         */
        for (Entry<String, String> ent : headers.entrySet()){
            String key = ent.getKey();
            String val = ent.getValue();
            if ("path".equals(key)){
                currEntry.setName(val);
            } else if ("linkpath".equals(key)){
                currEntry.setLinkName(val);
            } else if ("gid".equals(key)){
                currEntry.setGroupId(Long.parseLong(val));
            } else if ("gname".equals(key)){
                currEntry.setGroupName(val);
            } else if ("uid".equals(key)){
                currEntry.setUserId(Long.parseLong(val));
            } else if ("uname".equals(key)){
                currEntry.setUserName(val);
            } else if ("size".equals(key)){
                currEntry.setSize(Long.parseLong(val));
            } else if ("mtime".equals(key)){
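                // mtime is given in (possibly fractional) seconds since the
                // epoch; setModTime expects milliseconds, so scale by 1000
                // and truncate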
                currEntry.setModTime((long) (Double.parseDouble(val) * 1000));
            } else if ("SCHILY.devminor".equals(key)){
                currEntry.setDevMinor(Integer.parseInt(val));
            } else if ("SCHILY.devmajor".equals(key)){
                currEntry.setDevMajor(Integer.parseInt(val));
            } else if ("GNU.sparse.size".equals(key)) {
                currEntry.fillGNUSparse0xData(headers);
            } else if ("GNU.sparse.realsize".equals(key)) {
                currEntry.fillGNUSparse1xData(headers);
            } else if ("SCHILY.filetype".equals(key) && "sparse".equals(val)) {
                currEntry.fillStarSparseData(headers);
            }
        }
    }

    /**
     * Adds the sparse chunks of the current entry to the list of sparse
     * chunks, including any additional sparse entries following the
     * current entry.
     *
     * @throws IOException on error
     *
     * @todo Sparse files are not really processed yet.
     */
    private void readOldGNUSparse() throws IOException {
        /* we do not really process sparse files yet
        sparses = new ArrayList();
        sparses.addAll(currEntry.getSparses());
        */
        if (currEntry.isExtended()) {
            TarArchiveSparseEntry entry;
            do {
                byte[] headerBuf = getRecord();
                if (headerBuf == null) {
                    currEntry = null;
                    break;
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                /* we do not really process sparse files yet
                sparses.addAll(entry.getSparses());
                */
            } while (entry.isExtended());
        }
    }

    private boolean isDirectory() {
        return currEntry != null && currEntry.isDirectory();
    }

    /**
     * Returns the next Archive Entry in this Stream.
     *
     * @return the next entry,
     *         or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public ArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }

    /**
     * Tries to read the next record, rewinding the stream if it is not an EOF record.
     *
     * <p>This is meant to protect against cases where a tar
     * implementation has written only one EOF record when two are
     * expected. Actually this won't help since a non-conforming
     * implementation likely won't fill full blocks consisting of - by
     * default - ten records either, so we probably have already read
     * beyond the archive anyway.</p>
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        boolean marked = is.markSupported();
        if (marked) {
            is.mark(recordSize);
        }
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            if (shouldReset && marked) {
                pushedBackBytes(recordSize);
                is.reset();
            }
        }
    }

    /**
     * Reads bytes from the current tar archive entry.
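     *
     * <p>A typical copy loop might look like the following sketch, where
     * {@code tarIn} is an instance of this class and {@code out} is any
     * OutputStream (both assumed to exist):</p>
     * <pre>{@code
     * byte[] buffer = new byte[4096];
     * int n;
     * while ((n = tarIn.read(buffer, 0, buffer.length)) != -1) {
     *     out.write(buffer, 0, n);
     * }
     * }</pre>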
     *
     * <p>This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.</p>
     *
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(byte[] buf, int offset, int numToRead) throws IOException {
        int totalRead = 0;

        if (hasHitEOF || isDirectory() || entryOffset >= entrySize) {
            return -1;
        }

        if (currEntry == null) {
            throw new IllegalStateException("No current tar entry");
        }

        numToRead = Math.min(numToRead, available());

        totalRead = is.read(buf, offset, numToRead);

        if (totalRead == -1) {
            if (numToRead > 0) {
                throw new IOException("Truncated TAR archive");
            }
            hasHitEOF = true;
        } else {
            count(totalRead);
            entryOffset += totalRead;
        }

        return totalRead;
    }

    /**
     * Whether this class is able to read the given entry.
     *
     * <p>May return false if the current entry is a sparse file.</p>
     *
     * @param ae the entry to test
     * @return whether the data of the given entry can be read
     */
    @Override
    public boolean canReadEntryData(ArchiveEntry ae) {
        if (ae instanceof TarArchiveEntry) {
            TarArchiveEntry te = (TarArchiveEntry) ae;
            return !te.isSparse();
        }
        return false;
    }

    /**
     * Get the current TAR Archive Entry that this input stream is processing.
     *
     * @return The current Archive Entry
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }

    protected final void setCurrentEntry(TarArchiveEntry e) {
        currEntry = e;
    }

    protected final boolean isAtEOF() {
        return hasHitEOF;
    }

    protected final void setAtEOF(boolean b) {
        hasHitEOF = b;
    }

    /**
     * This method is invoked once the end of the archive is hit; it
     * tries to consume the remaining bytes under the assumption that
     * the tool creating this archive has padded the last block.
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        long bytesReadOfLastBlock = getBytesRead() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            long skipped = IOUtils.skip(is, blockSize - bytesReadOfLastBlock);
            count(skipped);
        }
    }

    /**
     * Checks if the signature matches what is expected for a tar file.
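     *
     * <p>Callers would typically buffer the first record of the stream and
     * probe it, for example along these lines (a sketch; {@code in} is
     * assumed to support mark/reset):</p>
     * <pre>{@code
     * byte[] signature = new byte[512]; // one default-sized record
     * in.mark(signature.length);
     * int read = IOUtils.readFully(in, signature);
     * in.reset();
     * boolean isTar = TarArchiveInputStream.matches(signature, read);
     * }</pre>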
     *
     * @param signature
     *            the bytes to check
     * @param length
     *            the number of bytes to check
     * @return true, if this stream is a tar archive stream, false otherwise
     */
    public static boolean matches(byte[] signature, int length) {
        if (length < TarConstants.VERSION_OFFSET + TarConstants.VERSIONLEN) {
            return false;
        }

        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            ){
            return true;
        }
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            (
             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
             ||
             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            )
            ){
            return true;
        }
        // COMPRESS-107 - recognise Ant tar files
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            ){
            return true;
        }
        return false;
    }

}