[mirror] Scalable static site server for Git forges (like GitHub Pages)
at main 410 lines 13 kB view raw
1//go:generate protoc --go_out=. --go_opt=paths=source_relative schema.proto 2 3package git_pages 4 5import ( 6 "bytes" 7 "context" 8 "crypto/sha256" 9 "errors" 10 "fmt" 11 "mime" 12 "net/http" 13 "path" 14 "path/filepath" 15 "strings" 16 "sync" 17 "time" 18 19 "github.com/c2h5oh/datasize" 20 "github.com/go-git/go-git/v6/plumbing" 21 format "github.com/go-git/go-git/v6/plumbing/format/config" 22 "github.com/klauspost/compress/zstd" 23 "github.com/prometheus/client_golang/prometheus" 24 "github.com/prometheus/client_golang/prometheus/promauto" 25 "google.golang.org/protobuf/encoding/protojson" 26 "google.golang.org/protobuf/proto" 27) 28 29var ( 30 siteCompressionSpaceSaving = promauto.NewHistogram(prometheus.HistogramOpts{ 31 Name: "git_pages_site_compression_space_saving", 32 Help: "Reduction in site size after compression relative to the uncompressed size", 33 Buckets: []float64{.01, .025, .05, .1, .25, .5, .75, 1, 1.25, 1.5, 1.75, 2, 2.5, 5, 10}, 34 35 NativeHistogramBucketFactor: 1.1, 36 NativeHistogramMaxBucketNumber: 100, 37 NativeHistogramMinResetDuration: 10 * time.Minute, 38 }) 39) 40 41func NewManifest() *Manifest { 42 return &Manifest{ 43 Contents: map[string]*Entry{ 44 "": {Type: Type_Directory.Enum()}, 45 }, 46 } 47} 48 49func IsManifestEmpty(manifest *Manifest) bool { 50 if len(manifest.Contents) > 1 { 51 return false 52 } 53 for name, entry := range manifest.Contents { 54 if name == "" && entry.GetType() == Type_Directory { 55 return true 56 } 57 } 58 panic(fmt.Errorf("malformed manifest %v", manifest)) 59} 60 61// Returns `true` if `left` and `right` contain the same files with the same types and data. 62func CompareManifest(left *Manifest, right *Manifest) bool { 63 if len(left.Contents) != len(right.Contents) { 64 return false 65 } 66 for name, leftEntry := range left.Contents { 67 rightEntry := right.Contents[name] 68 if rightEntry == nil { 69 return false 70 } 71 if leftEntry.GetType() != rightEntry.GetType() { 72 return false 73 } 74 if !bytes.Equal(leftEntry.Data, rightEntry.Data) { 75 return false 76 } 77 } 78 return true 79} 80 81func EncodeManifest(manifest *Manifest) (data []byte) { 82 data, err := proto.MarshalOptions{Deterministic: true}.Marshal(manifest) 83 if err != nil { 84 panic(err) 85 } 86 return 87} 88 89func DecodeManifest(data []byte) (manifest *Manifest, err error) { 90 manifest = &Manifest{} 91 err = proto.Unmarshal(data, manifest) 92 return 93} 94 95func NewManifestEntry(type_ Type, data []byte) *Entry { 96 entry := &Entry{} 97 entry.Type = type_.Enum() 98 if data != nil { 99 entry.Data = data 100 entry.Transform = Transform_Identity.Enum() 101 entry.OriginalSize = proto.Int64(int64(len(data))) 102 entry.CompressedSize = proto.Int64(int64(len(data))) 103 } 104 return entry 105} 106 107func AddFile(manifest *Manifest, fileName string, data []byte) *Entry { 108 // Fill in `git_hash` even for files not originating from git using the SHA256 algorithm; 109 // we use this primarily for incremental archive uploads, but when support for git SHA256 110 // repositories is complete, archive uploads and git checkouts will have cross-support for 111 // incremental updates. 112 hasher := plumbing.NewHasher(format.SHA256, plumbing.BlobObject, int64(len(data))) 113 hasher.Write(data) 114 entry := NewManifestEntry(Type_InlineFile, data) 115 entry.GitHash = proto.String(hasher.Sum().String()) 116 manifest.Contents[fileName] = entry 117 return entry 118} 119 120func AddSymlink(manifest *Manifest, fileName string, target string) *Entry { 121 if path.IsAbs(target) { 122 AddProblem(manifest, fileName, "absolute symlink: %s", target) 123 return nil 124 } else { 125 entry := NewManifestEntry(Type_Symlink, []byte(target)) 126 manifest.Contents[fileName] = entry 127 return entry 128 } 129} 130 131func AddDirectory(manifest *Manifest, dirName string) *Entry { 132 dirName = strings.TrimSuffix(dirName, "/") 133 entry := NewManifestEntry(Type_Directory, nil) 134 manifest.Contents[dirName] = entry 135 return entry 136} 137 138func AddProblem(manifest *Manifest, pathName, format string, args ...any) error { 139 cause := fmt.Sprintf(format, args...) 140 manifest.Problems = append(manifest.Problems, &Problem{ 141 Path: proto.String(pathName), 142 Cause: proto.String(cause), 143 }) 144 return fmt.Errorf("%s: %s", pathName, cause) 145} 146 147// EnsureLeadingDirectories adds directory entries for any parent directories 148// that are implicitly referenced by files in the manifest but don't have 149// explicit directory entries. (This can be the case if an archive is created 150// via globs rather than including a whole directory.) 151func EnsureLeadingDirectories(manifest *Manifest) { 152 for name := range manifest.Contents { 153 for dir := path.Dir(name); dir != "." && dir != ""; dir = path.Dir(dir) { 154 if dir == "/" { 155 panic("malformed manifest (paths must not be rooted in /)") 156 } 157 if _, exists := manifest.Contents[dir]; !exists { 158 AddDirectory(manifest, dir) 159 } 160 } 161 } 162} 163 164func GetProblemReport(manifest *Manifest) []string { 165 var report []string 166 for _, problem := range manifest.Problems { 167 report = append(report, 168 fmt.Sprintf("/%s: %s", problem.GetPath(), problem.GetCause())) 169 } 170 return report 171} 172 173func ManifestJSON(manifest *Manifest) []byte { 174 json, err := protojson.MarshalOptions{ 175 Multiline: true, 176 EmitDefaultValues: true, 177 }.Marshal(manifest) 178 if err != nil { 179 panic(err) 180 } 181 return json 182} 183 184var ErrSymlinkLoop = errors.New("symbolic link loop") 185 186func ExpandSymlinks(manifest *Manifest, inPath string) (string, error) { 187 var levels uint 188again: 189 for levels = 0; levels < config.Limits.MaxSymlinkDepth; levels += 1 { 190 parts := strings.Split(inPath, "/") 191 for i := 1; i <= len(parts); i++ { 192 linkPath := path.Join(parts[:i]...) 193 entry := manifest.Contents[linkPath] 194 if entry != nil && entry.GetType() == Type_Symlink { 195 inPath = path.Join( 196 path.Dir(linkPath), 197 string(entry.Data), 198 path.Join(parts[i:]...), 199 ) 200 continue again 201 } 202 } 203 break 204 } 205 if levels < config.Limits.MaxSymlinkDepth { 206 return inPath, nil 207 } else { 208 return "", ErrSymlinkLoop 209 } 210} 211 212// Sniff content type using the same algorithm as `http.ServeContent`. 213func DetectContentType(manifest *Manifest) { 214 for path, entry := range manifest.Contents { 215 if entry.GetType() == Type_Directory || entry.GetType() == Type_Symlink { 216 // no Content-Type 217 } else if entry.GetType() == Type_InlineFile && entry.GetTransform() == Transform_Identity { 218 contentType := mime.TypeByExtension(filepath.Ext(path)) 219 if contentType == "" { 220 contentType = http.DetectContentType(entry.Data[:min(512, len(entry.Data))]) 221 } 222 entry.ContentType = proto.String(contentType) 223 } else if entry.GetContentType() == "" { 224 panic(fmt.Errorf("DetectContentType encountered invalid entry: %v, %v", 225 entry.GetType(), entry.GetTransform())) 226 } 227 } 228} 229 230// The `klauspost/compress/zstd` package recommends reusing a compressor to avoid repeated 231// allocations of internal buffers. 232var zstdEncoder, _ = zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedBetterCompression)) 233 234// Compress contents of inline files. 235func CompressFiles(ctx context.Context, manifest *Manifest) { 236 span, _ := ObserveFunction(ctx, "CompressFiles") 237 defer span.Finish() 238 239 var originalSize int64 240 var compressedSize int64 241 for _, entry := range manifest.Contents { 242 if entry.GetType() == Type_InlineFile && entry.GetTransform() == Transform_Identity { 243 mediaType := getMediaType(entry.GetContentType()) 244 if strings.HasPrefix(mediaType, "video/") || strings.HasPrefix(mediaType, "audio/") { 245 continue 246 } 247 compressedData := zstdEncoder.EncodeAll(entry.GetData(), 248 make([]byte, 0, entry.GetOriginalSize())) 249 if int64(len(compressedData)) < entry.GetOriginalSize() { 250 entry.Data = compressedData 251 entry.Transform = Transform_Zstd.Enum() 252 entry.CompressedSize = proto.Int64(int64(len(entry.Data))) 253 } 254 } 255 originalSize += entry.GetOriginalSize() 256 compressedSize += entry.GetCompressedSize() 257 } 258 manifest.OriginalSize = proto.Int64(originalSize) 259 manifest.CompressedSize = proto.Int64(compressedSize) 260 261 if originalSize != 0 { 262 spaceSaving := (float64(originalSize) - float64(compressedSize)) / float64(originalSize) 263 logc.Printf(ctx, "compress: saved %.2f percent (%s to %s)", 264 spaceSaving*100.0, 265 datasize.ByteSize(originalSize).HR(), 266 datasize.ByteSize(compressedSize).HR(), 267 ) 268 siteCompressionSpaceSaving. 269 Observe(spaceSaving) 270 } 271} 272 273// Apply post-processing steps to the manifest. 274// At the moment, there isn't a good way to report errors except to log them on the terminal. 275// (Perhaps in the future they could be exposed at `.git-pages/status.txt`?) 276func PrepareManifest(ctx context.Context, manifest *Manifest) error { 277 // Parse Netlify-style `_redirects`. 278 if err := ProcessRedirectsFile(manifest); err != nil { 279 logc.Printf(ctx, "redirects err: %s\n", err) 280 } else if len(manifest.Redirects) > 0 { 281 logc.Printf(ctx, "redirects ok: %d rules\n", len(manifest.Redirects)) 282 } 283 284 // Check if any redirects are unreachable. 285 LintRedirects(manifest) 286 287 // Parse Netlify-style `_headers`. 288 if err := ProcessHeadersFile(manifest); err != nil { 289 logc.Printf(ctx, "headers err: %s\n", err) 290 } else if len(manifest.Headers) > 0 { 291 logc.Printf(ctx, "headers ok: %d rules\n", len(manifest.Headers)) 292 } 293 294 // Sniff content type like `http.ServeContent`. 295 DetectContentType(manifest) 296 297 // Opportunistically compress blobs (must be done last). 298 CompressFiles(ctx, manifest) 299 300 return nil 301} 302 303var ErrSiteTooLarge = errors.New("site too large") 304var ErrManifestTooLarge = errors.New("manifest too large") 305 306// Uploads inline file data over certain size to the storage backend. Returns a copy of 307// the manifest updated to refer to an external content-addressable store. 308func StoreManifest( 309 ctx context.Context, name string, manifest *Manifest, opts ModifyManifestOptions, 310) (*Manifest, error) { 311 span, ctx := ObserveFunction(ctx, "StoreManifest", "manifest.name", name) 312 defer span.Finish() 313 314 // Replace inline files over certain size with references to external data. 315 extManifest := Manifest{ 316 RepoUrl: manifest.RepoUrl, 317 Branch: manifest.Branch, 318 Commit: manifest.Commit, 319 Contents: make(map[string]*Entry), 320 Redirects: manifest.Redirects, 321 Headers: manifest.Headers, 322 Problems: manifest.Problems, 323 OriginalSize: manifest.OriginalSize, 324 CompressedSize: manifest.CompressedSize, 325 StoredSize: proto.Int64(0), 326 } 327 for name, entry := range manifest.Contents { 328 cannotBeInlined := entry.GetType() == Type_InlineFile && 329 entry.GetCompressedSize() > int64(config.Limits.MaxInlineFileSize.Bytes()) 330 if cannotBeInlined { 331 dataHash := sha256.Sum256(entry.Data) 332 extManifest.Contents[name] = &Entry{ 333 Type: Type_ExternalFile.Enum(), 334 OriginalSize: entry.OriginalSize, 335 CompressedSize: entry.CompressedSize, 336 Data: fmt.Appendf(nil, "sha256-%x", dataHash), 337 Transform: entry.Transform, 338 ContentType: entry.ContentType, 339 GitHash: entry.GitHash, 340 } 341 } else { 342 extManifest.Contents[name] = entry 343 } 344 } 345 346 // Compute the total and deduplicated storage size. 347 totalSize := int64(0) 348 blobSizes := map[string]int64{} 349 for _, entry := range extManifest.Contents { 350 totalSize += entry.GetOriginalSize() 351 if entry.GetType() == Type_ExternalFile { 352 blobSizes[string(entry.Data)] = entry.GetCompressedSize() 353 } 354 } 355 if uint64(totalSize) > config.Limits.MaxSiteSize.Bytes() { 356 return nil, fmt.Errorf("%w: contents size %s exceeds %s limit", 357 ErrSiteTooLarge, 358 datasize.ByteSize(totalSize).HR(), 359 config.Limits.MaxSiteSize.HR(), 360 ) 361 } 362 for _, blobSize := range blobSizes { 363 *extManifest.StoredSize += blobSize 364 } 365 366 // Upload the resulting manifest and the blob it references. 367 extManifestData := EncodeManifest(&extManifest) 368 if uint64(len(extManifestData)) > config.Limits.MaxManifestSize.Bytes() { 369 return nil, fmt.Errorf("%w: manifest size %s exceeds %s limit", 370 ErrManifestTooLarge, 371 datasize.ByteSize(len(extManifestData)).HR(), 372 config.Limits.MaxManifestSize, 373 ) 374 } 375 376 if err := backend.StageManifest(ctx, &extManifest); err != nil { 377 return nil, fmt.Errorf("stage manifest: %w", err) 378 } 379 380 wg := sync.WaitGroup{} 381 ch := make(chan error, len(extManifest.Contents)) 382 for name, entry := range extManifest.Contents { 383 // Upload external entries (those that were decided as ineligible for being stored inline). 384 // If the entry in the original manifest is already an external reference, there's no need 385 // to externalize it (and no way for us to do so, since the entry only contains the blob name). 386 if entry.GetType() == Type_ExternalFile && manifest.Contents[name].GetType() == Type_InlineFile { 387 wg.Go(func() { 388 err := backend.PutBlob(ctx, string(entry.Data), manifest.Contents[name].Data) 389 if err != nil { 390 ch <- fmt.Errorf("put blob %s: %w", name, err) 391 } 392 }) 393 } 394 } 395 wg.Wait() 396 close(ch) 397 for err := range ch { 398 return nil, err // currently ignores all but 1st error 399 } 400 401 if err := backend.CommitManifest(ctx, name, &extManifest, opts); err != nil { 402 if errors.Is(err, ErrDomainFrozen) { 403 return nil, err 404 } else { 405 return nil, fmt.Errorf("commit manifest: %w", err) 406 } 407 } 408 409 return &extManifest, nil 410}