...

Source file src/encoding/csv/reader.go

Documentation: encoding/csv

		 1  // Copyright 2011 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  // Package csv reads and writes comma-separated values (CSV) files.
		 6  // There are many kinds of CSV files; this package supports the format
		 7  // described in RFC 4180.
		 8  //
		 9  // A csv file contains zero or more records of one or more fields per record.
		10  // Each record is separated by the newline character. The final record may
		11  // optionally be followed by a newline character.
		12  //
		13  //	field1,field2,field3
		14  //
		15  // White space is considered part of a field.
		16  //
		17  // Carriage returns before newline characters are silently removed.
		18  //
		19  // Blank lines are ignored. A line with only whitespace characters (excluding
		20  // the ending newline character) is not considered a blank line.
		21  //
		22  // Fields which start and stop with the quote character " are called
		23  // quoted-fields. The beginning and ending quote are not part of the
		24  // field.
		25  //
		26  // The source:
		27  //
		28  //	normal string,"quoted-field"
		29  //
		30  // results in the fields
		31  //
		32  //	{`normal string`, `quoted-field`}
		33  //
		34  // Within a quoted-field a quote character followed by a second quote
		35  // character is considered a single quote.
		36  //
		37  //	"the ""word"" is true","a ""quoted-field"""
		38  //
		39  // results in
		40  //
		41  //	{`the "word" is true`, `a "quoted-field"`}
		42  //
		43  // Newlines and commas may be included in a quoted-field
		44  //
		45  //	"Multi-line
		46  //	field","comma is ,"
		47  //
		48  // results in
		49  //
		50  //	{`Multi-line
		51  //	field`, `comma is ,`}
		52  package csv
		53  
		54  import (
		55  	"bufio"
		56  	"bytes"
		57  	"errors"
		58  	"fmt"
		59  	"io"
		60  	"unicode"
		61  	"unicode/utf8"
		62  )
		63  
// A ParseError is returned for parsing errors.
// Line numbers and columns are 1-indexed; columns are counted in bytes,
// not runes (see the Column field and Reader.FieldPos).
type ParseError struct {
	StartLine int   // Line where the record starts
	Line      int   // Line where the error occurred
	Column    int   // Column (1-based byte index) where the error occurred
	Err       error // The actual error
}
		72  
		73  func (e *ParseError) Error() string {
		74  	if e.Err == ErrFieldCount {
		75  		return fmt.Sprintf("record on line %d: %v", e.Line, e.Err)
		76  	}
		77  	if e.StartLine != e.Line {
		78  		return fmt.Sprintf("record on line %d; parse error on line %d, column %d: %v", e.StartLine, e.Line, e.Column, e.Err)
		79  	}
		80  	return fmt.Sprintf("parse error on line %d, column %d: %v", e.Line, e.Column, e.Err)
		81  }
		82  
// Unwrap returns the underlying error so that errors.Is and errors.As
// can match against ParseError.Err (e.g. ErrQuote, ErrFieldCount).
func (e *ParseError) Unwrap() error { return e.Err }
		84  
// These are the errors that can be returned in ParseError.Err.
var (
	// ErrTrailingComma is no longer returned by this package.
	//
	// Deprecated: No longer used.
	ErrTrailingComma = errors.New("extra delimiter at end of line")

	// ErrBareQuote is returned when a quote appears in a non-quoted
	// field and LazyQuotes is false.
	ErrBareQuote = errors.New("bare \" in non-quoted-field")

	// ErrQuote is returned for an improperly escaped or unterminated
	// quote inside a quoted field.
	ErrQuote = errors.New("extraneous or missing \" in quoted-field")

	// ErrFieldCount is returned when a record does not have the number
	// of fields required by Reader.FieldsPerRecord.
	ErrFieldCount = errors.New("wrong number of fields")
)
		92  
// errInvalidDelim is returned by readRecord when Comma or Comment is
// not usable as a delimiter (see validDelim) or when the two are equal.
var errInvalidDelim = errors.New("csv: invalid field or comment delimiter")
		94  
		95  func validDelim(r rune) bool {
		96  	return r != 0 && r != '"' && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError
		97  }
		98  
// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
//
// The Reader converts all \r\n sequences in its input to plain \n,
// including in multiline field values, so that the returned data does
// not depend on which line-ending convention an input file uses.
type Reader struct {
	// Comma is the field delimiter.
	// It is set to comma (',') by NewReader.
	// Comma must be a valid rune and must not be \r, \n,
	// or the Unicode replacement character (0xFFFD).
	Comma rune

	// Comment, if not 0, is the comment character. Lines beginning with the
	// Comment character without preceding whitespace are ignored.
	// With leading whitespace the Comment character becomes part of the
	// field, even if TrimLeadingSpace is true.
	// Comment must be a valid rune and must not be \r, \n,
	// or the Unicode replacement character (0xFFFD).
	// It must also not be equal to Comma.
	Comment rune

	// FieldsPerRecord is the number of expected fields per record.
	// If FieldsPerRecord is positive, Read requires each record to
	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
	// the number of fields in the first record, so that future records must
	// have the same field count. If FieldsPerRecord is negative, no check is
	// made and records may have a variable number of fields.
	FieldsPerRecord int

	// If LazyQuotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	LazyQuotes bool

	// If TrimLeadingSpace is true, leading white space in a field is ignored.
	// This is done even if the field delimiter, Comma, is white space.
	TrimLeadingSpace bool

	// ReuseRecord controls whether calls to Read may return a slice sharing
	// the backing array of the previous call's returned slice for performance.
	// By default, each call to Read returns newly allocated memory owned by the caller.
	ReuseRecord bool

	// TrailingComma is ignored.
	//
	// Deprecated: No longer used.
	TrailingComma bool

	// r is the buffered source the CSV data is read from.
	r *bufio.Reader

	// numLine is the current line being read in the CSV file.
	numLine int

	// rawBuffer is a line buffer only used by the readLine method.
	rawBuffer []byte

	// recordBuffer holds the unescaped fields, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
	// and fieldIndexes will contain the indexes [1, 2, 5, 6].
	recordBuffer []byte

	// fieldIndexes is an index of fields inside recordBuffer.
	// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
	fieldIndexes []int

	// fieldPositions is an index of field positions for the
	// last record returned by Read.
	fieldPositions []position

	// lastRecord is a record cache and only used when ReuseRecord == true.
	lastRecord []string
}
	 172  
	 173  // NewReader returns a new Reader that reads from r.
	 174  func NewReader(r io.Reader) *Reader {
	 175  	return &Reader{
	 176  		Comma: ',',
	 177  		r:		 bufio.NewReader(r),
	 178  	}
	 179  }
	 180  
	 181  // Read reads one record (a slice of fields) from r.
	 182  // If the record has an unexpected number of fields,
	 183  // Read returns the record along with the error ErrFieldCount.
	 184  // Except for that case, Read always returns either a non-nil
	 185  // record or a non-nil error, but not both.
	 186  // If there is no data left to be read, Read returns nil, io.EOF.
	 187  // If ReuseRecord is true, the returned slice may be shared
	 188  // between multiple calls to Read.
	 189  func (r *Reader) Read() (record []string, err error) {
	 190  	if r.ReuseRecord {
	 191  		record, err = r.readRecord(r.lastRecord)
	 192  		r.lastRecord = record
	 193  	} else {
	 194  		record, err = r.readRecord(nil)
	 195  	}
	 196  	return record, err
	 197  }
	 198  
	 199  // FieldPos returns the line and column corresponding to
	 200  // the start of the field with the given index in the slice most recently
	 201  // returned by Read. Numbering of lines and columns starts at 1;
	 202  // columns are counted in bytes, not runes.
	 203  //
	 204  // If this is called with an out-of-bounds index, it panics.
	 205  func (r *Reader) FieldPos(field int) (line, column int) {
	 206  	if field < 0 || field >= len(r.fieldPositions) {
	 207  		panic("out of range index passed to FieldPos")
	 208  	}
	 209  	p := &r.fieldPositions[field]
	 210  	return p.line, p.col
	 211  }
	 212  
// A position holds the location of a field in the input.
// Both line and col are 1-based; col is a byte offset within the line
// (see FieldPos).
type position struct {
	line, col int
}
	 217  
	 218  // ReadAll reads all the remaining records from r.
	 219  // Each record is a slice of fields.
	 220  // A successful call returns err == nil, not err == io.EOF. Because ReadAll is
	 221  // defined to read until EOF, it does not treat end of file as an error to be
	 222  // reported.
	 223  func (r *Reader) ReadAll() (records [][]string, err error) {
	 224  	for {
	 225  		record, err := r.readRecord(nil)
	 226  		if err == io.EOF {
	 227  			return records, nil
	 228  		}
	 229  		if err != nil {
	 230  			return nil, err
	 231  		}
	 232  		records = append(records, record)
	 233  	}
	 234  }
	 235  
	 236  // readLine reads the next line (with the trailing endline).
	 237  // If EOF is hit without a trailing endline, it will be omitted.
	 238  // If some bytes were read, then the error is never io.EOF.
	 239  // The result is only valid until the next call to readLine.
	 240  func (r *Reader) readLine() ([]byte, error) {
	 241  	line, err := r.r.ReadSlice('\n')
	 242  	if err == bufio.ErrBufferFull {
	 243  		r.rawBuffer = append(r.rawBuffer[:0], line...)
	 244  		for err == bufio.ErrBufferFull {
	 245  			line, err = r.r.ReadSlice('\n')
	 246  			r.rawBuffer = append(r.rawBuffer, line...)
	 247  		}
	 248  		line = r.rawBuffer
	 249  	}
	 250  	if len(line) > 0 && err == io.EOF {
	 251  		err = nil
	 252  		// For backwards compatibility, drop trailing \r before EOF.
	 253  		if line[len(line)-1] == '\r' {
	 254  			line = line[:len(line)-1]
	 255  		}
	 256  	}
	 257  	r.numLine++
	 258  	// Normalize \r\n to \n on all input lines.
	 259  	if n := len(line); n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' {
	 260  		line[n-2] = '\n'
	 261  		line = line[:n-1]
	 262  	}
	 263  	return line, err
	 264  }
	 265  
	 266  // lengthNL reports the number of bytes for the trailing \n.
	 267  func lengthNL(b []byte) int {
	 268  	if len(b) > 0 && b[len(b)-1] == '\n' {
	 269  		return 1
	 270  	}
	 271  	return 0
	 272  }
	 273  
	 274  // nextRune returns the next rune in b or utf8.RuneError.
	 275  func nextRune(b []byte) rune {
	 276  	r, _ := utf8.DecodeRune(b)
	 277  	return r
	 278  }
	 279  
// readRecord reads and parses one CSV record, writing the fields into
// dst (whose backing array is reused when large enough) and returning
// them. It returns io.EOF when no data remains, or a *ParseError for
// the first syntax problem found. Field bytes accumulate in
// r.recordBuffer with boundaries in r.fieldIndexes; the final string
// slicing at the bottom converts them into the returned []string.
func (r *Reader) readRecord(dst []string) ([]string, error) {
	// Reject delimiter configurations the parser cannot handle.
	if r.Comma == r.Comment || !validDelim(r.Comma) || (r.Comment != 0 && !validDelim(r.Comment)) {
		return nil, errInvalidDelim
	}

	// Read line (automatically skipping past empty lines and any comments).
	var line []byte
	var errRead error
	for errRead == nil {
		line, errRead = r.readLine()
		if r.Comment != 0 && nextRune(line) == r.Comment {
			line = nil
			continue // Skip comment lines
		}
		if errRead == nil && len(line) == lengthNL(line) {
			line = nil
			continue // Skip empty lines
		}
		break
	}
	if errRead == io.EOF {
		return nil, errRead
	}

	// Parse each field in the record.
	var err error
	const quoteLen = len(`"`)
	commaLen := utf8.RuneLen(r.Comma)
	recLine := r.numLine // Starting line for record
	r.recordBuffer = r.recordBuffer[:0]
	r.fieldIndexes = r.fieldIndexes[:0]
	r.fieldPositions = r.fieldPositions[:0]
	// pos tracks the 1-based line/column of the byte currently being
	// examined; it is advanced in lockstep with every slice of `line`.
	pos := position{line: r.numLine, col: 1}
parseField:
	for {
		if r.TrimLeadingSpace {
			// Skip leading whitespace; if the rest of the line is all
			// space, treat it as an empty field ending at the newline.
			i := bytes.IndexFunc(line, func(r rune) bool {
				return !unicode.IsSpace(r)
			})
			if i < 0 {
				i = len(line)
				pos.col -= lengthNL(line)
			}
			line = line[i:]
			pos.col += i
		}
		if len(line) == 0 || line[0] != '"' {
			// Non-quoted string field
			i := bytes.IndexRune(line, r.Comma)
			field := line
			if i >= 0 {
				field = field[:i]
			} else {
				field = field[:len(field)-lengthNL(field)]
			}
			// Check to make sure a quote does not appear in field.
			if !r.LazyQuotes {
				if j := bytes.IndexByte(field, '"'); j >= 0 {
					col := pos.col + j
					err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
					break parseField
				}
			}
			r.recordBuffer = append(r.recordBuffer, field...)
			r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
			r.fieldPositions = append(r.fieldPositions, pos)
			if i >= 0 {
				// More fields follow on this line; skip past the comma.
				line = line[i+commaLen:]
				pos.col += i + commaLen
				continue parseField
			}
			break parseField
		} else {
			// Quoted string field
			fieldPos := pos
			line = line[quoteLen:]
			pos.col += quoteLen
			for {
				i := bytes.IndexByte(line, '"')
				if i >= 0 {
					// Hit next quote.
					r.recordBuffer = append(r.recordBuffer, line[:i]...)
					line = line[i+quoteLen:]
					pos.col += i + quoteLen
					// What follows the closing quote decides whether the
					// field ends, the quote was escaped, or it is an error.
					switch rn := nextRune(line); {
					case rn == '"':
						// `""` sequence (append quote).
						r.recordBuffer = append(r.recordBuffer, '"')
						line = line[quoteLen:]
						pos.col += quoteLen
					case rn == r.Comma:
						// `",` sequence (end of field).
						line = line[commaLen:]
						pos.col += commaLen
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						r.fieldPositions = append(r.fieldPositions, fieldPos)
						continue parseField
					case lengthNL(line) == len(line):
						// `"\n` sequence (end of line).
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						r.fieldPositions = append(r.fieldPositions, fieldPos)
						break parseField
					case r.LazyQuotes:
						// `"` sequence (bare quote).
						r.recordBuffer = append(r.recordBuffer, '"')
					default:
						// `"*` sequence (invalid non-escaped quote).
						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: pos.col - quoteLen, Err: ErrQuote}
						break parseField
					}
				} else if len(line) > 0 {
					// Hit end of line (copy all data so far).
					// The quoted field continues on the next line.
					r.recordBuffer = append(r.recordBuffer, line...)
					if errRead != nil {
						break parseField
					}
					pos.col += len(line)
					line, errRead = r.readLine()
					if len(line) > 0 {
						pos.line++
						pos.col = 1
					}
					if errRead == io.EOF {
						errRead = nil
					}
				} else {
					// Abrupt end of file (EOF or error).
					if !r.LazyQuotes && errRead == nil {
						// Unterminated quoted field with strict quoting.
						err = &ParseError{StartLine: recLine, Line: pos.line, Column: pos.col, Err: ErrQuote}
						break parseField
					}
					r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
					r.fieldPositions = append(r.fieldPositions, fieldPos)
					break parseField
				}
			}
		}
	}
	if err == nil {
		err = errRead
	}

	// Create a single string and create slices out of it.
	// This pins the memory of the fields together, but allocates once.
	str := string(r.recordBuffer) // Convert to string once to batch allocations
	dst = dst[:0]
	if cap(dst) < len(r.fieldIndexes) {
		dst = make([]string, len(r.fieldIndexes))
	}
	dst = dst[:len(r.fieldIndexes)]
	var preIdx int
	for i, idx := range r.fieldIndexes {
		dst[i] = str[preIdx:idx]
		preIdx = idx
	}

	// Check or update the expected fields per record.
	if r.FieldsPerRecord > 0 {
		if len(dst) != r.FieldsPerRecord && err == nil {
			err = &ParseError{
				StartLine: recLine,
				Line:      recLine,
				Column:    1,
				Err:       ErrFieldCount,
			}
		}
	} else if r.FieldsPerRecord == 0 {
		r.FieldsPerRecord = len(dst)
	}
	return dst, err
}
	 451  

View as plain text