[mirror] Scalable static site server for Git forges (like GitHub Pages)
1//go:generate protoc --go_out=. --go_opt=paths=source_relative schema.proto
2
3package git_pages
4
5import (
6 "bytes"
7 "context"
8 "crypto/sha256"
9 "errors"
10 "fmt"
11 "mime"
12 "net/http"
13 "path"
14 "path/filepath"
15 "strings"
16 "sync"
17 "time"
18
19 "github.com/c2h5oh/datasize"
20 "github.com/go-git/go-git/v6/plumbing"
21 format "github.com/go-git/go-git/v6/plumbing/format/config"
22 "github.com/klauspost/compress/zstd"
23 "github.com/prometheus/client_golang/prometheus"
24 "github.com/prometheus/client_golang/prometheus/promauto"
25 "google.golang.org/protobuf/encoding/protojson"
26 "google.golang.org/protobuf/proto"
27)
28
var (
	// siteCompressionSpaceSaving records, per processed site, the fractional
	// size reduction achieved by compression: (original - compressed) / original.
	// Observed by CompressFiles. Both classic buckets and native-histogram
	// settings are configured so either representation can be scraped.
	siteCompressionSpaceSaving = promauto.NewHistogram(prometheus.HistogramOpts{
		Name:    "git_pages_site_compression_space_saving",
		Help:    "Reduction in site size after compression relative to the uncompressed size",
		Buckets: []float64{.01, .025, .05, .1, .25, .5, .75, 1, 1.25, 1.5, 1.75, 2, 2.5, 5, 10},

		NativeHistogramBucketFactor:     1.1,
		NativeHistogramMaxBucketNumber:  100,
		NativeHistogramMinResetDuration: 10 * time.Minute,
	})
)
40
41func NewManifest() *Manifest {
42 return &Manifest{
43 Contents: map[string]*Entry{
44 "": {Type: Type_Directory.Enum()},
45 },
46 }
47}
48
49func IsManifestEmpty(manifest *Manifest) bool {
50 if len(manifest.Contents) > 1 {
51 return false
52 }
53 for name, entry := range manifest.Contents {
54 if name == "" && entry.GetType() == Type_Directory {
55 return true
56 }
57 }
58 panic(fmt.Errorf("malformed manifest %v", manifest))
59}
60
61// Returns `true` if `left` and `right` contain the same files with the same types and data.
62func CompareManifest(left *Manifest, right *Manifest) bool {
63 if len(left.Contents) != len(right.Contents) {
64 return false
65 }
66 for name, leftEntry := range left.Contents {
67 rightEntry := right.Contents[name]
68 if rightEntry == nil {
69 return false
70 }
71 if leftEntry.GetType() != rightEntry.GetType() {
72 return false
73 }
74 if !bytes.Equal(leftEntry.Data, rightEntry.Data) {
75 return false
76 }
77 }
78 return true
79}
80
81func EncodeManifest(manifest *Manifest) (data []byte) {
82 data, err := proto.MarshalOptions{Deterministic: true}.Marshal(manifest)
83 if err != nil {
84 panic(err)
85 }
86 return
87}
88
89func DecodeManifest(data []byte) (manifest *Manifest, err error) {
90 manifest = &Manifest{}
91 err = proto.Unmarshal(data, manifest)
92 return
93}
94
95func NewManifestEntry(type_ Type, data []byte) *Entry {
96 entry := &Entry{}
97 entry.Type = type_.Enum()
98 if data != nil {
99 entry.Data = data
100 entry.Transform = Transform_Identity.Enum()
101 entry.OriginalSize = proto.Int64(int64(len(data)))
102 entry.CompressedSize = proto.Int64(int64(len(data)))
103 }
104 return entry
105}
106
// AddFile inserts an inline file entry for fileName into the manifest and
// returns it. The entry's git_hash is a SHA-256 git blob hash of data.
func AddFile(manifest *Manifest, fileName string, data []byte) *Entry {
	// Fill in `git_hash` even for files not originating from git using the SHA256 algorithm;
	// we use this primarily for incremental archive uploads, but when support for git SHA256
	// repositories is complete, archive uploads and git checkouts will have cross-support for
	// incremental updates.
	hasher := plumbing.NewHasher(format.SHA256, plumbing.BlobObject, int64(len(data)))
	hasher.Write(data)
	entry := NewManifestEntry(Type_InlineFile, data)
	entry.GitHash = proto.String(hasher.Sum().String())
	manifest.Contents[fileName] = entry
	return entry
}
119
120func AddSymlink(manifest *Manifest, fileName string, target string) *Entry {
121 if path.IsAbs(target) {
122 AddProblem(manifest, fileName, "absolute symlink: %s", target)
123 return nil
124 } else {
125 entry := NewManifestEntry(Type_Symlink, []byte(target))
126 manifest.Contents[fileName] = entry
127 return entry
128 }
129}
130
131func AddDirectory(manifest *Manifest, dirName string) *Entry {
132 dirName = strings.TrimSuffix(dirName, "/")
133 entry := NewManifestEntry(Type_Directory, nil)
134 manifest.Contents[dirName] = entry
135 return entry
136}
137
138func AddProblem(manifest *Manifest, pathName, format string, args ...any) error {
139 cause := fmt.Sprintf(format, args...)
140 manifest.Problems = append(manifest.Problems, &Problem{
141 Path: proto.String(pathName),
142 Cause: proto.String(cause),
143 })
144 return fmt.Errorf("%s: %s", pathName, cause)
145}
146
147// EnsureLeadingDirectories adds directory entries for any parent directories
148// that are implicitly referenced by files in the manifest but don't have
149// explicit directory entries. (This can be the case if an archive is created
150// via globs rather than including a whole directory.)
151func EnsureLeadingDirectories(manifest *Manifest) {
152 for name := range manifest.Contents {
153 for dir := path.Dir(name); dir != "." && dir != ""; dir = path.Dir(dir) {
154 if dir == "/" {
155 panic("malformed manifest (paths must not be rooted in /)")
156 }
157 if _, exists := manifest.Contents[dir]; !exists {
158 AddDirectory(manifest, dir)
159 }
160 }
161 }
162}
163
164func GetProblemReport(manifest *Manifest) []string {
165 var report []string
166 for _, problem := range manifest.Problems {
167 report = append(report,
168 fmt.Sprintf("/%s: %s", problem.GetPath(), problem.GetCause()))
169 }
170 return report
171}
172
173func ManifestJSON(manifest *Manifest) []byte {
174 json, err := protojson.MarshalOptions{
175 Multiline: true,
176 EmitDefaultValues: true,
177 }.Marshal(manifest)
178 if err != nil {
179 panic(err)
180 }
181 return json
182}
183
// ErrSymlinkLoop is returned by ExpandSymlinks when resolving a path still
// encounters symlinks after the configured maximum number of expansions.
var ErrSymlinkLoop = errors.New("symbolic link loop")
185
// ExpandSymlinks resolves symlink entries found along any prefix of inPath,
// restarting the scan each time a link is substituted. At most
// config.Limits.MaxSymlinkDepth substitutions are performed; if a symlink is
// still present after that, ErrSymlinkLoop is returned.
func ExpandSymlinks(manifest *Manifest, inPath string) (string, error) {
	var levels uint
again:
	for levels = 0; levels < config.Limits.MaxSymlinkDepth; levels += 1 {
		// Examine every prefix of the path, shortest first, and follow the
		// first symlink entry encountered.
		parts := strings.Split(inPath, "/")
		for i := 1; i <= len(parts); i++ {
			linkPath := path.Join(parts[:i]...)
			entry := manifest.Contents[linkPath]
			if entry != nil && entry.GetType() == Type_Symlink {
				// Splice the link target (resolved relative to the link's
				// directory) in place of the prefix, then rescan the whole
				// path from the top on the next outer iteration.
				inPath = path.Join(
					path.Dir(linkPath),
					string(entry.Data),
					path.Join(parts[i:]...),
				)
				continue again
			}
		}
		// No symlink anywhere in the path: resolution is complete.
		break
	}
	// Exhausting the loop without an early break means the depth limit was hit.
	if levels < config.Limits.MaxSymlinkDepth {
		return inPath, nil
	} else {
		return "", ErrSymlinkLoop
	}
}
211
212// Sniff content type using the same algorithm as `http.ServeContent`.
213func DetectContentType(manifest *Manifest) {
214 for path, entry := range manifest.Contents {
215 if entry.GetType() == Type_Directory || entry.GetType() == Type_Symlink {
216 // no Content-Type
217 } else if entry.GetType() == Type_InlineFile && entry.GetTransform() == Transform_Identity {
218 contentType := mime.TypeByExtension(filepath.Ext(path))
219 if contentType == "" {
220 contentType = http.DetectContentType(entry.Data[:min(512, len(entry.Data))])
221 }
222 entry.ContentType = proto.String(contentType)
223 } else if entry.GetContentType() == "" {
224 panic(fmt.Errorf("DetectContentType encountered invalid entry: %v, %v",
225 entry.GetType(), entry.GetTransform()))
226 }
227 }
228}
229
// The `klauspost/compress/zstd` package recommends reusing a compressor to avoid repeated
// allocations of internal buffers. EncodeAll is safe for concurrent use on a
// shared encoder.
// NOTE(review): the construction error is discarded; with a nil writer and
// these static options failure is not expected — confirm against the library docs.
var zstdEncoder, _ = zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedBetterCompression))
233
234// Compress contents of inline files.
235func CompressFiles(ctx context.Context, manifest *Manifest) {
236 span, _ := ObserveFunction(ctx, "CompressFiles")
237 defer span.Finish()
238
239 var originalSize int64
240 var compressedSize int64
241 for _, entry := range manifest.Contents {
242 if entry.GetType() == Type_InlineFile && entry.GetTransform() == Transform_Identity {
243 mediaType := getMediaType(entry.GetContentType())
244 if strings.HasPrefix(mediaType, "video/") || strings.HasPrefix(mediaType, "audio/") {
245 continue
246 }
247 compressedData := zstdEncoder.EncodeAll(entry.GetData(),
248 make([]byte, 0, entry.GetOriginalSize()))
249 if int64(len(compressedData)) < entry.GetOriginalSize() {
250 entry.Data = compressedData
251 entry.Transform = Transform_Zstd.Enum()
252 entry.CompressedSize = proto.Int64(int64(len(entry.Data)))
253 }
254 }
255 originalSize += entry.GetOriginalSize()
256 compressedSize += entry.GetCompressedSize()
257 }
258 manifest.OriginalSize = proto.Int64(originalSize)
259 manifest.CompressedSize = proto.Int64(compressedSize)
260
261 if originalSize != 0 {
262 spaceSaving := (float64(originalSize) - float64(compressedSize)) / float64(originalSize)
263 logc.Printf(ctx, "compress: saved %.2f percent (%s to %s)",
264 spaceSaving*100.0,
265 datasize.ByteSize(originalSize).HR(),
266 datasize.ByteSize(compressedSize).HR(),
267 )
268 siteCompressionSpaceSaving.
269 Observe(spaceSaving)
270 }
271}
272
273// Apply post-processing steps to the manifest.
274// At the moment, there isn't a good way to report errors except to log them on the terminal.
275// (Perhaps in the future they could be exposed at `.git-pages/status.txt`?)
276func PrepareManifest(ctx context.Context, manifest *Manifest) error {
277 // Parse Netlify-style `_redirects`.
278 if err := ProcessRedirectsFile(manifest); err != nil {
279 logc.Printf(ctx, "redirects err: %s\n", err)
280 } else if len(manifest.Redirects) > 0 {
281 logc.Printf(ctx, "redirects ok: %d rules\n", len(manifest.Redirects))
282 }
283
284 // Check if any redirects are unreachable.
285 LintRedirects(manifest)
286
287 // Parse Netlify-style `_headers`.
288 if err := ProcessHeadersFile(manifest); err != nil {
289 logc.Printf(ctx, "headers err: %s\n", err)
290 } else if len(manifest.Headers) > 0 {
291 logc.Printf(ctx, "headers ok: %d rules\n", len(manifest.Headers))
292 }
293
294 // Sniff content type like `http.ServeContent`.
295 DetectContentType(manifest)
296
297 // Opportunistically compress blobs (must be done last).
298 CompressFiles(ctx, manifest)
299
300 return nil
301}
302
// ErrSiteTooLarge is returned by StoreManifest when the total content size
// exceeds the configured MaxSiteSize limit.
var ErrSiteTooLarge = errors.New("site too large")

// ErrManifestTooLarge is returned by StoreManifest when the encoded manifest
// exceeds the configured MaxManifestSize limit.
var ErrManifestTooLarge = errors.New("manifest too large")
305
306// Uploads inline file data over certain size to the storage backend. Returns a copy of
307// the manifest updated to refer to an external content-addressable store.
308func StoreManifest(
309 ctx context.Context, name string, manifest *Manifest, opts ModifyManifestOptions,
310) (*Manifest, error) {
311 span, ctx := ObserveFunction(ctx, "StoreManifest", "manifest.name", name)
312 defer span.Finish()
313
314 // Replace inline files over certain size with references to external data.
315 extManifest := Manifest{
316 RepoUrl: manifest.RepoUrl,
317 Branch: manifest.Branch,
318 Commit: manifest.Commit,
319 Contents: make(map[string]*Entry),
320 Redirects: manifest.Redirects,
321 Headers: manifest.Headers,
322 Problems: manifest.Problems,
323 OriginalSize: manifest.OriginalSize,
324 CompressedSize: manifest.CompressedSize,
325 StoredSize: proto.Int64(0),
326 }
327 for name, entry := range manifest.Contents {
328 cannotBeInlined := entry.GetType() == Type_InlineFile &&
329 entry.GetCompressedSize() > int64(config.Limits.MaxInlineFileSize.Bytes())
330 if cannotBeInlined {
331 dataHash := sha256.Sum256(entry.Data)
332 extManifest.Contents[name] = &Entry{
333 Type: Type_ExternalFile.Enum(),
334 OriginalSize: entry.OriginalSize,
335 CompressedSize: entry.CompressedSize,
336 Data: fmt.Appendf(nil, "sha256-%x", dataHash),
337 Transform: entry.Transform,
338 ContentType: entry.ContentType,
339 GitHash: entry.GitHash,
340 }
341 } else {
342 extManifest.Contents[name] = entry
343 }
344 }
345
346 // Compute the total and deduplicated storage size.
347 totalSize := int64(0)
348 blobSizes := map[string]int64{}
349 for _, entry := range extManifest.Contents {
350 totalSize += entry.GetOriginalSize()
351 if entry.GetType() == Type_ExternalFile {
352 blobSizes[string(entry.Data)] = entry.GetCompressedSize()
353 }
354 }
355 if uint64(totalSize) > config.Limits.MaxSiteSize.Bytes() {
356 return nil, fmt.Errorf("%w: contents size %s exceeds %s limit",
357 ErrSiteTooLarge,
358 datasize.ByteSize(totalSize).HR(),
359 config.Limits.MaxSiteSize.HR(),
360 )
361 }
362 for _, blobSize := range blobSizes {
363 *extManifest.StoredSize += blobSize
364 }
365
366 // Upload the resulting manifest and the blob it references.
367 extManifestData := EncodeManifest(&extManifest)
368 if uint64(len(extManifestData)) > config.Limits.MaxManifestSize.Bytes() {
369 return nil, fmt.Errorf("%w: manifest size %s exceeds %s limit",
370 ErrManifestTooLarge,
371 datasize.ByteSize(len(extManifestData)).HR(),
372 config.Limits.MaxManifestSize,
373 )
374 }
375
376 if err := backend.StageManifest(ctx, &extManifest); err != nil {
377 return nil, fmt.Errorf("stage manifest: %w", err)
378 }
379
380 wg := sync.WaitGroup{}
381 ch := make(chan error, len(extManifest.Contents))
382 for name, entry := range extManifest.Contents {
383 // Upload external entries (those that were decided as ineligible for being stored inline).
384 // If the entry in the original manifest is already an external reference, there's no need
385 // to externalize it (and no way for us to do so, since the entry only contains the blob name).
386 if entry.GetType() == Type_ExternalFile && manifest.Contents[name].GetType() == Type_InlineFile {
387 wg.Go(func() {
388 err := backend.PutBlob(ctx, string(entry.Data), manifest.Contents[name].Data)
389 if err != nil {
390 ch <- fmt.Errorf("put blob %s: %w", name, err)
391 }
392 })
393 }
394 }
395 wg.Wait()
396 close(ch)
397 for err := range ch {
398 return nil, err // currently ignores all but 1st error
399 }
400
401 if err := backend.CommitManifest(ctx, name, &extManifest, opts); err != nil {
402 if errors.Is(err, ErrDomainFrozen) {
403 return nil, err
404 } else {
405 return nil, fmt.Errorf("commit manifest: %w", err)
406 }
407 }
408
409 return &extManifest, nil
410}