// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import (
	"runtime/internal/atomic"
	"unsafe"
)

// A profBuf is a lock-free buffer for profiling events,
// safe for concurrent use by one reader and one writer.
// The writer may be a signal handler running without a user g.
// The reader is assumed to be a user g.
//
// Each logged event corresponds to a fixed size header, a list of
// uintptrs (typically a stack), and exactly one unsafe.Pointer tag.
// The header and uintptrs are stored in the circular buffer data and the
// tag is stored in a circular buffer tags, running in parallel.
// In the circular buffer data, each event takes 2+hdrsize+len(stk)
// words: the value 2+hdrsize+len(stk), then the time of the event, then
// hdrsize words giving the fixed-size header, and then len(stk) words
// for the stack.
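//
// For example, with hdrsize = 1, a record for a 3-frame stack occupies
// 6 words in data: the length 6, the time of the event, the single
// header word, and then the 3 stack PCs, with exactly one entry
// alongside in tags.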
//
// The current effective offsets into the tags and data circular buffers
// for reading and writing are stored in the high 30 and low 32 bits of r and w.
// The bottom bits of the high 32 are additional flag bits in w, unused in r.
// "Effective" offsets means the total number of reads or writes, mod 2^30 or
// 2^32 (the width of the stored count). The offset in the buffer is the
// effective offset mod the length of the buffer. To make count wraparound
// match wraparound mod the length of the buffer, the length of the buffer
// must be a power of two.
//
// If the reader catches up to the writer, a flag passed to read controls
// whether the read blocks until more data is available. A read returns a
// pointer to the buffer data itself; the caller is assumed to be done with
// that data at the next read. The read offset rNext tracks the next offset to
// be returned by read. By definition, r ≤ rNext ≤ w (before wraparound),
// and rNext is only used by the reader, so it can be accessed without atomics.
//
// If the writer gets ahead of the reader, so that the buffer fills,
// future writes are discarded and replaced in the output stream by an
// overflow entry, which has size 2+hdrsize+1, time set to the time of
// the first discarded write, a header of all zeroed words, and a "stack"
// containing one word, the number of discarded writes.
//
// Between the time the buffer fills and the buffer becomes empty enough
// to hold more data, the overflow entry is stored as a pending overflow
// entry in the fields overflow and overflowTime. The pending overflow
// entry can be turned into a real record by either the writer or the
// reader. If the writer is called to write a new record and finds that
// the output buffer has room for both the pending overflow entry and the
// new record, the writer emits the pending overflow entry and the new
// record into the buffer. If the reader is called to read data and finds
// that the output buffer is empty but that there is a pending overflow
// entry, the reader will return a synthesized record for the pending
// overflow entry.
//
// Only the writer can create or add to a pending overflow entry, but
// either the reader or the writer can clear the pending overflow entry.
// A pending overflow entry is indicated by the low 32 bits of 'overflow'
// holding the number of discarded writes, and overflowTime holding the
// time of the first discarded write. The high 32 bits of 'overflow'
// increment each time the low 32 bits transition from zero to non-zero
// or vice versa. This sequence number avoids ABA problems in the use of
// compare-and-swap to coordinate between reader and writer.
// The overflowTime is only written when the low 32 bits of overflow are
// zero, that is, only when there is no pending overflow entry, in
// preparation for creating a new one. The reader can therefore fetch and
// clear the entry atomically using
//
//	for {
//		overflow = load(&b.overflow)
//		if uint32(overflow) == 0 {
//			// no pending entry
//			break
//		}
//		time = load(&b.overflowTime)
//		if cas(&b.overflow, overflow, ((overflow>>32)+1)<<32) {
//			// pending entry cleared
//			break
//		}
//	}
//	if uint32(overflow) > 0 {
//		emit entry for uint32(overflow), time
//	}
//
type profBuf struct {
	// accessed atomically
	r, w         profAtomic
	overflow     uint64
	overflowTime uint64
	eof          uint32

	// immutable (excluding slice content)
	hdrsize uintptr
	data    []uint64
	tags    []unsafe.Pointer

	// owned by reader
	rNext       profIndex
	overflowBuf []uint64 // for use by reader to return overflow record
	wait        note
}

// A profAtomic is the atomically-accessed word holding a profIndex.
type profAtomic uint64

// A profIndex is the packed tag and data counts and flag bits, described above.
type profIndex uint64

const (
	profReaderSleeping profIndex = 1 << 32 // reader is sleeping and must be woken up
	profWriteExtra     profIndex = 1 << 33 // overflow or eof waiting
)

func (x *profAtomic) load() profIndex {
	return profIndex(atomic.Load64((*uint64)(x)))
}

func (x *profAtomic) store(new profIndex) {
	atomic.Store64((*uint64)(x), uint64(new))
}

func (x *profAtomic) cas(old, new profIndex) bool {
	return atomic.Cas64((*uint64)(x), uint64(old), uint64(new))
}

func (x profIndex) dataCount() uint32 {
	return uint32(x)
}

func (x profIndex) tagCount() uint32 {
	return uint32(x >> 34)
}

// countSub subtracts two counts obtained from profIndex.dataCount or profIndex.tagCount,
// assuming that they are no more than 2^29 apart (guaranteed since they are never more than
// len(data) or len(tags) apart, respectively).
// tagCount wraps at 2^30, while dataCount wraps at 2^32.
// This function works for both.
func countSub(x, y uint32) int {
	// x-y is 32-bit signed or 30-bit signed; sign-extend to 32 bits and convert to int.
	return int(int32(x-y) << 2 >> 2)
}

// addCountsAndClearFlags returns the packed form of "x + (data, tag) - all flags".
func (x profIndex) addCountsAndClearFlags(data, tag int) profIndex {
	return profIndex((uint64(x)>>34+uint64(uint32(tag)<<2>>2))<<34 | uint64(uint32(x)+uint32(data)))
}
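
// As an illustration of the packed index (a sketch, not code used by the
// runtime), counts accumulate and flag bits are dropped as follows:
//
//	var x profIndex                    // zero counts, no flags
//	x = x.addCountsAndClearFlags(5, 1) // x.dataCount() == 5, x.tagCount() == 1
//	x |= profReaderSleeping            // set a flag bit
//	y := x.addCountsAndClearFlags(3, 1)
//	// Now y.dataCount() == 8, y.tagCount() == 2, the flag bit is cleared,
//	// and countSub(y.dataCount(), x.dataCount()) == 3.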

// hasOverflow reports whether b has any overflow records pending.
func (b *profBuf) hasOverflow() bool {
	return uint32(atomic.Load64(&b.overflow)) > 0
}

// takeOverflow consumes the pending overflow records, returning the overflow count
// and the time of the first overflow.
// When called by the reader, it is racing against incrementOverflow.
func (b *profBuf) takeOverflow() (count uint32, time uint64) {
	overflow := atomic.Load64(&b.overflow)
	time = atomic.Load64(&b.overflowTime)
	for {
		count = uint32(overflow)
		if count == 0 {
			time = 0
			break
		}
		// Increment generation, clear overflow count in low bits.
		if atomic.Cas64(&b.overflow, overflow, ((overflow>>32)+1)<<32) {
			break
		}
		overflow = atomic.Load64(&b.overflow)
		time = atomic.Load64(&b.overflowTime)
	}
	return uint32(overflow), time
}

// incrementOverflow records a single overflow at time now.
// It is racing against a possible takeOverflow in the reader.
func (b *profBuf) incrementOverflow(now int64) {
	for {
		overflow := atomic.Load64(&b.overflow)

		// Once we see b.overflow reach 0, it's stable: no one else is changing it underfoot.
		// We need to set overflowTime if we're incrementing b.overflow from 0.
		if uint32(overflow) == 0 {
			// Store overflowTime first so it's always available when overflow != 0.
			atomic.Store64(&b.overflowTime, uint64(now))
			atomic.Store64(&b.overflow, (((overflow>>32)+1)<<32)+1)
			break
		}
		// Otherwise we're racing to increment against a reader
		// who wants to set b.overflow to 0.
		// Out of paranoia, leave 2³²-1 as a sticky overflow value,
		// to avoid wrapping around. Extremely unlikely.
		if int32(overflow) == -1 {
			break
		}
		if atomic.Cas64(&b.overflow, overflow, overflow+1) {
			break
		}
	}
}

// newProfBuf returns a new profiling buffer with room for
// a header of hdrsize words and a buffer of at least bufwords words.
func newProfBuf(hdrsize, bufwords, tags int) *profBuf {
	if min := 2 + hdrsize + 1; bufwords < min {
		bufwords = min
	}

	// Buffer sizes must be powers of two, so that we don't have to
	// worry about uint32 wraparound changing the effective position
	// within the buffers. We store 30 bits of count; limiting to 28
	// gives us some room for intermediate calculations.
	if bufwords >= 1<<28 || tags >= 1<<28 {
		throw("newProfBuf: buffer too large")
	}
	var i int
	for i = 1; i < bufwords; i <<= 1 {
	}
	bufwords = i
	for i = 1; i < tags; i <<= 1 {
	}
	tags = i

	b := new(profBuf)
	b.hdrsize = uintptr(hdrsize)
	b.data = make([]uint64, bufwords)
	b.tags = make([]unsafe.Pointer, tags)
	b.overflowBuf = make([]uint64, 2+b.hdrsize+1)
	return b
}

// canWriteRecord reports whether the buffer has room
// for a single contiguous record with a stack of length nstk.
func (b *profBuf) canWriteRecord(nstk int) bool {
	br := b.r.load()
	bw := b.w.load()

	// room for tag?
	if countSub(br.tagCount(), bw.tagCount())+len(b.tags) < 1 {
		return false
	}

	// room for data?
	nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)
	want := 2 + int(b.hdrsize) + nstk
	i := int(bw.dataCount() % uint32(len(b.data)))
	if i+want > len(b.data) {
		// Can't fit in trailing fragment of slice.
		// Skip over that and start over at beginning of slice.
		nd -= len(b.data) - i
	}
	return nd >= want
}
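
// As a concrete example of canWriteRecord's arithmetic: with len(b.data) == 16,
// the write position at offset 13, and a record needing want == 6 words, the
// record cannot use the 3 trailing words, so those 3 words are subtracted from
// the available space and the record must fit starting at offset 0.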

// canWriteTwoRecords reports whether the buffer has room
// for two records with stack lengths nstk1, nstk2, in that order.
// Each record must be contiguous on its own, but the two
// records need not be contiguous (one can be at the end of the buffer
// and the other can wrap around and start at the beginning of the buffer).
func (b *profBuf) canWriteTwoRecords(nstk1, nstk2 int) bool {
	br := b.r.load()
	bw := b.w.load()

	// room for tag?
	if countSub(br.tagCount(), bw.tagCount())+len(b.tags) < 2 {
		return false
	}

	// room for data?
	nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)

	// first record
	want := 2 + int(b.hdrsize) + nstk1
	i := int(bw.dataCount() % uint32(len(b.data)))
	if i+want > len(b.data) {
		// Can't fit in trailing fragment of slice.
		// Skip over that and start over at beginning of slice.
		nd -= len(b.data) - i
		i = 0
	}
	i += want
	nd -= want

	// second record
	want = 2 + int(b.hdrsize) + nstk2
	if i+want > len(b.data) {
		// Can't fit in trailing fragment of slice.
		// Skip over that and start over at beginning of slice.
		nd -= len(b.data) - i
		i = 0
	}
	return nd >= want
}

// write writes an entry to the profiling buffer b.
// The entry begins with a fixed hdr, which must have length at most
// b.hdrsize (it is zero-padded to b.hdrsize), followed by a
// variable-sized stack and a single tag pointer *tagPtr (or nil if
// tagPtr is nil).
// No write barriers allowed because this might be called from a signal handler.
func (b *profBuf) write(tagPtr *unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) {
	if b == nil {
		return
	}
	if len(hdr) > int(b.hdrsize) {
		throw("misuse of profBuf.write")
	}

	if hasOverflow := b.hasOverflow(); hasOverflow && b.canWriteTwoRecords(1, len(stk)) {
		// Room for both an overflow record and the one being written.
		// Write the overflow record if the reader hasn't gotten to it yet.
		// Only racing against reader, not other writers.
		count, time := b.takeOverflow()
		if count > 0 {
			var stk [1]uintptr
			stk[0] = uintptr(count)
			b.write(nil, int64(time), nil, stk[:])
		}
	} else if hasOverflow || !b.canWriteRecord(len(stk)) {
		// Pending overflow without room to write both the overflow and the new record,
		// or no overflow but also no room for the new record.
		b.incrementOverflow(now)
		b.wakeupExtra()
		return
	}

	// There's room: write the record.
	br := b.r.load()
	bw := b.w.load()

	// Profiling tag
	//
	// The tag is a pointer, but we can't run a write barrier here.
	// We have interrupted the OS-level execution of gp, but the
	// runtime still sees gp as executing. In effect, we are running
	// in place of the real gp. Since gp is the only goroutine that
	// can overwrite gp.labels, the value of gp.labels is stable during
	// this signal handler: it will still be reachable from gp when
	// we finish executing. If a GC is in progress right now, it must
	// keep gp.labels alive, because gp.labels is reachable from gp.
	// If gp were to overwrite gp.labels, the deletion barrier would
	// still shade that pointer, which would preserve it for the
	// in-progress GC, so all is well. Any future GC will see the
	// value we copied when scanning b.tags (heap-allocated).
	// We arrange that the store here is always overwriting a nil,
	// so there is no need for a deletion barrier on b.tags[wt].
	wt := int(bw.tagCount() % uint32(len(b.tags)))
	if tagPtr != nil {
		*(*uintptr)(unsafe.Pointer(&b.tags[wt])) = uintptr(unsafe.Pointer(*tagPtr))
	}

	// Main record.
	// It has to fit in a contiguous section of the slice, so if it doesn't fit at the end,
	// leave a rewind marker (0) and start over at the beginning of the slice.
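	// For example, with len(b.data) == 16, wd == 13, and a 6-word record,
	// b.data[13] is set to 0, skip becomes 3, and the record is written
	// starting at b.data[0]; the skipped words are charged to this write
	// when the counts are committed below.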
	wd := int(bw.dataCount() % uint32(len(b.data)))
	nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)
	skip := 0
	if wd+2+int(b.hdrsize)+len(stk) > len(b.data) {
		b.data[wd] = 0
		skip = len(b.data) - wd
		nd -= skip
		wd = 0
	}
	data := b.data[wd:]
	data[0] = uint64(2 + b.hdrsize + uintptr(len(stk))) // length
	data[1] = uint64(now)                               // time stamp
	// header, zero-padded
	i := uintptr(copy(data[2:2+b.hdrsize], hdr))
	for ; i < b.hdrsize; i++ {
		data[2+i] = 0
	}
	for i, pc := range stk {
		data[2+b.hdrsize+uintptr(i)] = uint64(pc)
	}

	for {
		// Commit write.
		// Racing with reader setting flag bits in b.w, to avoid lost wakeups.
		old := b.w.load()
		new := old.addCountsAndClearFlags(skip+2+len(stk)+int(b.hdrsize), 1)
		if !b.w.cas(old, new) {
			continue
		}
		// If there was a reader, wake it up.
		if old&profReaderSleeping != 0 {
			notewakeup(&b.wait)
		}
		break
	}
}
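
// A typical writer captures a stack and logs it with a fixed-size header and
// the current label set. An illustrative sketch (period and labels are
// placeholders, not part of this file):
//
//	var stk [64]uintptr
//	n := callers(1, stk[:])
//	hdr := [1]uint64{period} // hdrsize == 1 in this sketch
//	b.write(&labels, nanotime(), hdr[:], stk[:n])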

// close signals that there will be no more writes on the buffer.
// Once all the data has been read from the buffer, reads will return eof=true.
func (b *profBuf) close() {
	if atomic.Load(&b.eof) > 0 {
		throw("runtime: profBuf already closed")
	}
	atomic.Store(&b.eof, 1)
	b.wakeupExtra()
}

// wakeupExtra must be called after setting one of the "extra"
// atomic fields b.overflow or b.eof.
// It records the change in b.w and wakes up the reader if needed.
func (b *profBuf) wakeupExtra() {
	for {
		old := b.w.load()
		new := old | profWriteExtra
		if !b.w.cas(old, new) {
			continue
		}
		if old&profReaderSleeping != 0 {
			notewakeup(&b.wait)
		}
		break
	}
}

// profBufReadMode specifies whether to block when no data is available to read.
type profBufReadMode int

const (
	profBufBlocking profBufReadMode = iota
	profBufNonBlocking
)

var overflowTag [1]unsafe.Pointer // always nil

func (b *profBuf) read(mode profBufReadMode) (data []uint64, tags []unsafe.Pointer, eof bool) {
	if b == nil {
		return nil, nil, true
	}

	br := b.rNext

	// Commit previous read, returning that part of the ring to the writer.
	// First clear tags that have now been read, both to avoid holding
	// up the memory they point at for longer than necessary
	// and so that b.write can assume it is always overwriting
	// nil tag entries (see comment in b.write).
	rPrev := b.r.load()
	if rPrev != br {
		ntag := countSub(br.tagCount(), rPrev.tagCount())
		ti := int(rPrev.tagCount() % uint32(len(b.tags)))
		for i := 0; i < ntag; i++ {
			b.tags[ti] = nil
			if ti++; ti == len(b.tags) {
				ti = 0
			}
		}
		b.r.store(br)
	}

Read:
	bw := b.w.load()
	numData := countSub(bw.dataCount(), br.dataCount())
	if numData == 0 {
		if b.hasOverflow() {
			// No data to read, but there is overflow to report.
			// Racing with writer flushing b.overflow into a real record.
			count, time := b.takeOverflow()
			if count == 0 {
				// Lost the race, go around again.
				goto Read
			}
			// Won the race, report overflow.
			dst := b.overflowBuf
			dst[0] = uint64(2 + b.hdrsize + 1)
			dst[1] = uint64(time)
			for i := uintptr(0); i < b.hdrsize; i++ {
				dst[2+i] = 0
			}
			dst[2+b.hdrsize] = uint64(count)
			return dst[:2+b.hdrsize+1], overflowTag[:1], false
		}
		if atomic.Load(&b.eof) > 0 {
			// No data, no overflow, EOF set: done.
			return nil, nil, true
		}
		if bw&profWriteExtra != 0 {
			// Writer claims to have published extra information (overflow or eof).
			// Attempt to clear notification and then check again.
			// If we fail to clear the notification it means b.w changed,
			// so we still need to check again.
			b.w.cas(bw, bw&^profWriteExtra)
			goto Read
		}

		// Nothing to read right now.
		// Return or sleep according to mode.
		if mode == profBufNonBlocking {
			return nil, nil, false
		}
		if !b.w.cas(bw, bw|profReaderSleeping) {
			goto Read
		}
		// Committed to sleeping.
		notetsleepg(&b.wait, -1)
		noteclear(&b.wait)
		goto Read
	}
	data = b.data[br.dataCount()%uint32(len(b.data)):]
	if len(data) > numData {
		data = data[:numData]
	} else {
		numData -= len(data) // available in case of wraparound
	}
	skip := 0
	if data[0] == 0 {
		// Wraparound record. Go back to the beginning of the ring.
		skip = len(data)
		data = b.data
		if len(data) > numData {
			data = data[:numData]
		}
	}

	ntag := countSub(bw.tagCount(), br.tagCount())
	if ntag == 0 {
		throw("runtime: malformed profBuf buffer - tag and data out of sync")
	}
	tags = b.tags[br.tagCount()%uint32(len(b.tags)):]
	if len(tags) > ntag {
		tags = tags[:ntag]
	}

	// Count out whole data records until either data or tags is done.
	// They are always in sync in the buffer, but due to an end-of-slice
	// wraparound we might need to stop early and return the rest
	// in the next call.
	di := 0
	ti := 0
	for di < len(data) && data[di] != 0 && ti < len(tags) {
		if uintptr(di)+uintptr(data[di]) > uintptr(len(data)) {
			throw("runtime: malformed profBuf buffer - invalid size")
		}
		di += int(data[di])
		ti++
	}

	// Remember how much we returned, to commit read on next call.
	b.rNext = br.addCountsAndClearFlags(skip+di, ti)

	if raceenabled {
		// Match racereleasemerge in runtime_setProfLabel,
		// so that the setting of the labels in runtime_setProfLabel
		// is treated as happening before any use of the labels
		// by our caller. The synchronization on labelSync itself is a fiction
		// for the race detector. The actual synchronization is handled
		// by the fact that the signal handler only reads from the current
		// goroutine and uses atomics to write the updated queue indices,
		// and then the read-out from the signal handler buffer uses
		// atomics to read those queue indices.
		raceacquire(unsafe.Pointer(&labelSync))
	}

	return data[:di], tags[:ti], false
}
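
// The slices returned by read are decoded in parallel: each record in data is
// data[0] words long and has exactly one corresponding entry in tags. An
// illustrative consumer loop (a sketch; process is a placeholder, not part of
// this file):
//
//	for {
//		data, tags, eof := b.read(profBufBlocking)
//		if eof {
//			break
//		}
//		for len(data) > 0 {
//			n := int(data[0])            // total words in this record
//			t := int64(data[1])          // timestamp
//			hdr := data[2 : 2+b.hdrsize] // fixed-size header
//			stk := data[2+b.hdrsize : n] // stack PCs, widened to uint64
//			process(t, hdr, stk, tags[0])
//			data, tags = data[n:], tags[1:]
//		}
//	}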