add ice package
gernest committed Feb 9, 2024
1 parent 8b5448a commit 41cdfbe
Showing 34 changed files with 8,314 additions and 0 deletions.
53 changes: 53 additions & 0 deletions segment/ice/chunk.go
@@ -0,0 +1,53 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ice

import (
"fmt"
)

const maxDocsToScanSequentially = 1024

// legacyChunkMode was the original chunk mode (always chunk size 1024);
// this mode is still used for chunking doc values.
const legacyChunkMode uint32 = 1024

const chunkModeV1 uint32 = 1025

// defaultChunkMode is the most recent improvement to chunking and should
// be used by default.
const defaultChunkMode uint32 = chunkModeV1

func getChunkSize(chunkMode uint32, cardinality, maxDocs uint64) (uint64, error) {
switch {
// any chunkMode <= 1024 will always chunk with chunkSize=chunkMode
case chunkMode <= legacyChunkMode:
// legacy chunk size
return uint64(chunkMode), nil

case chunkMode == chunkModeV1:
		// chunkModeV1 rests on the observation that the fewest number of
		// dense chunks is the most desirable layout, given the built-in
		// assumption of chunking: we want an upper bound on the number of
		// items you must walk over without skipping, currently tuned to 1024.
		//
		// 1. compute the number of chunks needed (max 1024 items/chunk)
		// 2. convert to chunkSize by dividing into maxDocs
numChunks := (cardinality / maxDocsToScanSequentially) + 1
chunkSize := maxDocs / numChunks
return chunkSize, nil
}
return 0, fmt.Errorf("unknown chunk mode %d", chunkMode)
}
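
For intuition, a worked example of the chunkModeV1 sizing (hypothetical numbers, not part of this change):

	// With cardinality=10_000 postings spread over maxDocs=1_000_000 docs:
	//   numChunks = 10_000/1024 + 1 = 10
	//   chunkSize = 1_000_000/10  = 100_000
	size, err := getChunkSize(chunkModeV1, 10_000, 1_000_000)
	// size == 100_000, err == nil; each chunk then carries ~1000 postings
	// on average, preserving the 1024-item sequential-scan bound.
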
239 changes: 239 additions & 0 deletions segment/ice/contentcoder.go
@@ -0,0 +1,239 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ice

import (
"bytes"
"encoding/binary"
"io"
)

var termSeparator byte = 0xff
var termSeparatorSplitSlice = []byte{termSeparator}

type chunkedContentCoder struct {
final []byte
chunkSize uint64
currChunk uint64
chunkLens []uint64

w io.Writer
progressiveWrite bool

chunkMetaBuf bytes.Buffer
chunkBuf bytes.Buffer

chunkMeta []metaData

compressed []byte // temp buf for compression
}

// metaData represents the data information inside a
// chunk.
type metaData struct {
DocNum uint64 // docNum of the data inside the chunk
	DocDvOffset uint64 // end offset of this doc's data inside the chunk (the start is the previous entry's end)
}

// newChunkedContentCoder returns a new chunked content coder which
// packs data into chunks based on the provided chunkSize.
func newChunkedContentCoder(chunkSize, maxDocNum uint64,
w io.Writer, progressiveWrite bool) *chunkedContentCoder {
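	// the +1 leaves room for a trailing partial chunk when maxDocNum
	// is not an exact multiple of chunkSize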
total := maxDocNum/chunkSize + 1
rv := &chunkedContentCoder{
chunkSize: chunkSize,
chunkLens: make([]uint64, total),
chunkMeta: make([]metaData, 0, total),
w: w,
progressiveWrite: progressiveWrite,
}

return rv
}

// Reset lets you reuse this chunked content coder. Buffers are reset
// and reused; the chunk size is unchanged (use SetChunkSize for that).
func (c *chunkedContentCoder) Reset() {
c.currChunk = 0
c.final = c.final[:0]
c.chunkBuf.Reset()
c.chunkMetaBuf.Reset()
for i := range c.chunkLens {
c.chunkLens[i] = 0
}
c.chunkMeta = c.chunkMeta[:0]
}

func (c *chunkedContentCoder) SetChunkSize(chunkSize, maxDocNum uint64) {
total := int(maxDocNum/chunkSize + 1)
c.chunkSize = chunkSize
if cap(c.chunkLens) < total {
c.chunkLens = make([]uint64, total)
} else {
c.chunkLens = c.chunkLens[:total]
}
if cap(c.chunkMeta) < total {
c.chunkMeta = make([]metaData, 0, total)
}
}

// Close indicates you are done calling Add(); this allows
// the final chunk to be encoded.
func (c *chunkedContentCoder) Close() error {
return c.flushContents()
}

func (c *chunkedContentCoder) flushContents() error {
// flush the contents, with meta information at first
buf := make([]byte, binary.MaxVarintLen64)
n := binary.PutUvarint(buf, uint64(len(c.chunkMeta)))
_, err := c.chunkMetaBuf.Write(buf[:n])
if err != nil {
return err
}

// write out the metaData slice
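	// docNums and dv offsets only ever increase, so deltas against the
	// previous entry keep the uvarints small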
diffDocNum := uint64(0)
diffDvOffset := uint64(0)
for _, meta := range c.chunkMeta {
err = writeUvarints(&c.chunkMetaBuf, meta.DocNum-diffDocNum, meta.DocDvOffset-diffDvOffset)
if err != nil {
return err
}
diffDocNum = meta.DocNum
diffDvOffset = meta.DocDvOffset
}

// write the metadata to final data
metaData := c.chunkMetaBuf.Bytes()
c.final = append(c.final, c.chunkMetaBuf.Bytes()...)
// write the compressed data to the final data
c.compressed, err = ZSTDCompress(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes(), ZSTDCompressionLevel)
if err != nil {
return err
}
c.final = append(c.final, c.compressed...)

c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData))

if c.progressiveWrite {
_, err := c.w.Write(c.final)
if err != nil {
return err
}
c.final = c.final[:0]
}

return nil
}

// Add encodes the provided byte slice into the correct chunk for the provided
// doc num. You MUST call Add() with increasing docNums.
func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
chunk := docNum / c.chunkSize
if chunk != c.currChunk {
// flush out the previous chunk details
err := c.flushContents()
if err != nil {
return err
}
		// clear the chunk-specific meta for the next chunk
c.chunkBuf.Reset()
c.chunkMetaBuf.Reset()
c.chunkMeta = c.chunkMeta[:0]
c.currChunk = chunk
}

// get the starting offset for this doc
dvOffset := c.chunkBuf.Len()
dvSize, err := c.chunkBuf.Write(vals)
if err != nil {
return err
}

c.chunkMeta = append(c.chunkMeta, metaData{
DocNum: docNum,
DocDvOffset: uint64(dvOffset + dvSize),
})
return nil
}

// Write commits all the encoded chunked contents to the provided writer.
//
// | ..... data ..... | chunk offsets (varints)
// | length of chunk offsets (uint64) | number of offsets (uint64) |
//
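// A reader can work backward from the end of this section: the final
// uint64 is the number of chunks, the uint64 before it is the byte
// length of the varint chunk offsets, which sit immediately before it.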
func (c *chunkedContentCoder) Write() (int, error) {
var tw int

if c.final != nil {
// write out the data section first
nw, err := c.w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}
}

chunkOffsetsStart := uint64(tw)

if cap(c.final) < binary.MaxVarintLen64 {
c.final = make([]byte, binary.MaxVarintLen64)
} else {
c.final = c.final[0:binary.MaxVarintLen64]
}
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
// write out the chunk offsets
for _, chunkOffset := range chunkOffsets {
n := binary.PutUvarint(c.final, chunkOffset)
nw, err := c.w.Write(c.final[:n])
tw += nw
if err != nil {
return tw, err
}
}

chunkOffsetsLen := uint64(tw) - chunkOffsetsStart

c.final = c.final[0:8]
// write out the length of chunk offsets
binary.BigEndian.PutUint64(c.final, chunkOffsetsLen)
nw, err := c.w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}

// write out the number of chunks
binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens)))
nw, err = c.w.Write(c.final)
tw += nw
if err != nil {
return tw, err
}

c.final = c.final[:0]

return tw, nil
}

// readDocValueBoundary returns the start and end offsets of the given
// entry in a metaData header slice.
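// e.g. if the entries hold end offsets [5, 9, 14], chunk 1 spans [5, 9)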
func readDocValueBoundary(chunk int, metaHeaders []metaData) (start, end uint64) {
if chunk > 0 {
start = metaHeaders[chunk-1].DocDvOffset
}
return start, metaHeaders[chunk].DocDvOffset
}
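
For orientation, a minimal usage sketch of the coder (hypothetical, package-internal code such as a test in package ice, not part of this commit; it assumes the ZSTDCompress helper added elsewhere in this change is available):

	var out bytes.Buffer
	coder := newChunkedContentCoder(1024, 2048, &out, false)
	_ = coder.Add(10, []byte("values for doc 10"))     // lands in chunk 0
	_ = coder.Add(1500, []byte("values for doc 1500")) // chunk 1; flushes chunk 0 first
	_ = coder.Close()                                  // encodes the final (current) chunk
	if _, err := coder.Write(); err != nil {           // data | varint chunk offsets | offsets length | chunk count
		// handle the write error
	}
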