A Go implementation of Facebook's PDQ
trust-and-safety pdq

initial commit

hailey.at 74e13c25

+1239
+28
.gitignore
··· 1 + # Test data (downloaded images for benchmarking) 2 + testdata/ 3 + 4 + # Go build artifacts 5 + *.exe 6 + *.exe~ 7 + *.dll 8 + *.so 9 + *.dylib 10 + 11 + # Go test cache 12 + *.test 13 + *.out 14 + 15 + # Binaries 16 + /pdqhasher 17 + /helper 18 + 19 + # IDE 20 + .idea/ 21 + .vscode/ 22 + *.swp 23 + *.swo 24 + *~ 25 + 26 + # OS 27 + .DS_Store 28 + Thumbs.db
+21
LICENSE
··· 1 + MIT License 2 + 3 + Copyright (c) 2026 me@haileyok.com 4 + 5 + Permission is hereby granted, free of charge, to any person obtaining a copy 6 + of this software and associated documentation files (the "Software"), to deal 7 + in the Software without restriction, including without limitation the rights 8 + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 + copies of the Software, and to permit persons to whom the Software is 10 + furnished to do so, subject to the following conditions: 11 + 12 + The above copyright notice and this permission notice shall be included in all 13 + copies or substantial portions of the Software. 14 + 15 + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 + SOFTWARE.
+132
README.md
··· 1 + # gopdq 2 + 3 + A Go implementation of [Meta's PDQ](https://github.com/facebook/ThreatExchange/tree/main/pdq) perceptual hashing algorithm. 4 + 5 + PDQ is a perceptual hashing algorithm designed to identify visually similar images. It generates a compact 256-bit hash that remains stable across common image transformations like resizing, compression, and minor edits. 6 + 7 + 8 + ## Installation 9 + 10 + ```bash 11 + go get github.com/haileyok/gopdq 12 + ``` 13 + 14 + ## Usage 15 + 16 + There are two different functions provided in this package: `HashFromFile` and `HashFromImage`. While either will work, you should ensure that the input image has been resized to a size no greater than 512x512. See 17 + [the PDQ paper](https://github.com/facebook/ThreatExchange/blob/main/hashing/hashing.pdf). 18 + 19 + > Using two-pass Jarosz filters (i.e. tent convolutions), compute a weighted average of 64x64 subblocks of 20 + the luminance image. (This is prohibitively time-consuming for megapixel input so we recommend using an 21 + off-the-shelf technique to first resize to 512x512 before converting from RGB to luminance.) 22 + 23 + For convenience, there is a helper method `helpers.ResizeIfNeeded(img image.Image)` which will return a resized `image.Image` that can be passed to `HashFromImage`. 24 + 25 + 26 + ```go 27 + package main 28 + 29 + import ( 30 + "fmt" 31 + "log" 32 + 33 + "github.com/haileyok/gopdq" 34 + ) 35 + 36 + func main() { 37 + // Hash an image file, assuming it has already been resized. 38 + // NOTE: There is no logic that _guarantees_ an image has been resized, this is up to you to ensure. 
39 + result, err := pdq.HashFromFile("image.jpg") 40 + if err != nil { 41 + log.Fatal(err) 42 + } 43 + 44 + fmt.Printf("Hash: %s\n", result.Hash) 45 + fmt.Printf("Quality: %d\n", result.Quality) 46 + } 47 + ``` 48 + 49 + ### Using with pre-loaded images 50 + 51 + ```go 52 + import ( 53 + "image" 54 + _ "image/jpeg" 55 + 56 + "github.com/haileyok/gopdq" 57 + "github.com/haileyok/gopdq/helpers" 58 + ) 59 + 60 + func main() { 61 + // Open the image and decode it 62 + file, _ := os.Open("image.jpg") 63 + img, _, _ := image.Decode(file) 64 + 65 + // Resize if needed 66 + img = helpers.ResizeIfNeeded(img) 67 + 68 + // Generate hash 69 + result, _ := pdq.HashFromImage(img) 70 + fmt.Println(result.Hash) 71 + } 72 + ``` 73 + 74 + ### HashResult 75 + 76 + Both of the above functions will return a `HashResult`, which includes both the hash and the quality score. 77 + 78 + ```go 79 + type HashResult struct { 80 + Hash string 81 + Quality int // Results with a quality score < 50 should be discarded 82 + ImageHeightTimesWidth int 83 + HashDuration time.Duration 84 + } 85 + ``` 86 + 87 + ## Command Line Tools 88 + 89 + ### PDQ Hasher 90 + 91 + ```bash 92 + # Build the hasher 93 + go build ./cmd/pdqhasher 94 + 95 + # Hash an image 96 + ./pdqhasher path/to/image.jpg 97 + 98 + # Output: 99 + # Hash: e77b19ca5399466258c656bc4666a7853939a567a9193939e667199856ccc6c6 100 + # Quality: 100 101 + # Binary: 1110011110110001000110011010010100110011100110010100011001100010... 
102 + ``` 103 + 104 + ### Hamming Distance Helper 105 + 106 + ```bash 107 + # Build the helper 108 + go build ./cmd/helper 109 + 110 + # Calculate hamming distance 111 + ./helper hamming <hash1> <hash2> 112 + 113 + # Output: 114 + # 8 115 + ``` 116 + 117 + ## About Distance 118 + 119 + Please see https://github.com/facebook/ThreatExchange/tree/main/pdq#matching 120 + 121 + Note that outputs from the C++ implementation's example binary and the `pdqhasher` binary provided here may not return hashes that are exactly the same due to 122 + differences in resizing libraries. This is expected, see https://github.com/facebook/ThreatExchange/tree/main/pdq#hashing. 123 + 124 + ## References 125 + 126 + - [PDQ Algorithm (C++ Reference)](https://github.com/facebook/ThreatExchange/tree/main/pdq) 127 + - [PDQ Hashing Paper](https://github.com/facebook/ThreatExchange/blob/main/hashing/hashing.pdf) 128 + - [ThreatExchange](https://github.com/facebook/ThreatExchange) 129 + 130 + ## Acknowledgments 131 + 132 + This is a Go implementation of Meta's PDQ algorithm. All credit for the algorithm design goes to the original authors.
+306
cmd/benchmark/main.go
··· 1 + // slop code that seems to work fine 2 + 3 + package main 4 + 5 + import ( 6 + "context" 7 + "fmt" 8 + "image" 9 + _ "image/gif" 10 + _ "image/jpeg" 11 + _ "image/png" 12 + "log" 13 + "net/http" 14 + "os" 15 + "path/filepath" 16 + "runtime" 17 + "strings" 18 + "sync" 19 + "sync/atomic" 20 + "time" 21 + 22 + _ "net/http/pprof" 23 + 24 + pdq "github.com/haileyok/gopdq" 25 + "github.com/haileyok/gopdq/helpers" 26 + "github.com/urfave/cli/v3" 27 + _ "golang.org/x/image/bmp" 28 + _ "golang.org/x/image/tiff" 29 + _ "golang.org/x/image/webp" 30 + ) 31 + 32 + func main() { 33 + app := cli.Command{ 34 + Name: "benchmark", 35 + Usage: "Measure PDQ hashing throughput", 36 + Flags: []cli.Flag{ 37 + &cli.StringFlag{ 38 + Name: "dir", 39 + Aliases: []string{"d"}, 40 + Value: "testdata/images", 41 + Usage: "Directory containing test images", 42 + }, 43 + &cli.IntFlag{ 44 + Name: "duration", 45 + Aliases: []string{"t"}, 46 + Value: 10, 47 + Usage: "Duration in seconds to run the benchmark", 48 + }, 49 + &cli.IntFlag{ 50 + Name: "workers", 51 + Aliases: []string{"w"}, 52 + Value: 1, 53 + Usage: "Number of parallel workers (0 = auto = num CPUs)", 54 + }, 55 + &cli.BoolFlag{ 56 + Name: "with-resize", 57 + Aliases: []string{"r"}, 58 + Value: true, 59 + Usage: "Include resize in benchmark", 60 + }, 61 + &cli.BoolFlag{ 62 + Name: "with-io", 63 + Aliases: []string{"i"}, 64 + Value: false, 65 + Usage: "Include file I/O in benchmark (slower)", 66 + }, 67 + &cli.StringFlag{ 68 + Name: "metrics-addr", 69 + Value: ":6009", 70 + Usage: "Address for pprof server", 71 + }, 72 + }, 73 + Action: runBenchmark, 74 + } 75 + 76 + if err := app.Run(context.Background(), os.Args); err != nil { 77 + log.Fatal(err) 78 + } 79 + } 80 + 81 + func runBenchmark(ctx context.Context, cmd *cli.Command) error { 82 + metricsServer := http.DefaultServeMux 83 + go func() { 84 + if err := http.ListenAndServe(cmd.String("metrics-addr"), metricsServer); err != nil { 85 + log.Fatal(err) 86 + } 87 + }() 88 + 89 + 
imageDir := cmd.String("dir") 90 + duration := time.Duration(cmd.Int("duration")) * time.Second 91 + numWorkers := cmd.Int("workers") 92 + withResize := cmd.Bool("with-resize") 93 + withIO := cmd.Bool("with-io") 94 + 95 + if numWorkers == 0 { 96 + numWorkers = 8 97 + } 98 + 99 + cpuInfo := getCPUInfo() 100 + 101 + fmt.Printf("PDQ Hashing Throughput Benchmark\n") 102 + fmt.Printf("=================================\n\n") 103 + fmt.Printf("CPU: %s\n", cpuInfo) 104 + fmt.Printf("CPU Cores: %d\n", runtime.NumCPU()) 105 + fmt.Printf("Image Directory: %s\n", imageDir) 106 + fmt.Printf("Duration: %v\n", duration) 107 + fmt.Printf("Workers: %d\n", numWorkers) 108 + fmt.Printf("With Resize: %v\n", withResize) 109 + fmt.Printf("With I/O: %v\n\n", withIO) 110 + 111 + type imageWithSize struct { 112 + img image.Image 113 + originalSize int // max dimension before resize 114 + } 115 + 116 + var imagePaths []string 117 + var preloadedImages []imageWithSize 118 + 119 + fmt.Print("Loading test images... 
") 120 + if withIO { 121 + var err error 122 + imagePaths, err = loadImagePaths(imageDir) 123 + if err != nil { 124 + return fmt.Errorf("failed to load images: %w", err) 125 + } 126 + fmt.Printf("%d images found\n\n", len(imagePaths)) 127 + } else { 128 + var err error 129 + imagePaths, err = loadImagePaths(imageDir) 130 + if err != nil { 131 + return fmt.Errorf("failed to load images: %w", err) 132 + } 133 + 134 + for _, path := range imagePaths { 135 + file, err := os.Open(path) 136 + if err != nil { 137 + continue 138 + } 139 + img, _, err := image.Decode(file) 140 + file.Close() 141 + if err != nil { 142 + continue 143 + } 144 + 145 + // Store original size before resize 146 + bounds := img.Bounds() 147 + originalSize := max(bounds.Dx(), bounds.Dy()) 148 + 149 + if withResize { 150 + img = helpers.ResizeIfNeeded(img) 151 + } 152 + 153 + preloadedImages = append(preloadedImages, imageWithSize{ 154 + img: img, 155 + originalSize: originalSize, 156 + }) 157 + } 158 + fmt.Printf("%d images loaded and decoded\n\n", len(preloadedImages)) 159 + } 160 + 161 + if len(imagePaths) == 0 && len(preloadedImages) == 0 { 162 + return fmt.Errorf("no images found in %s. 
Run setup_testdata.py to download test images", imageDir) 163 + } 164 + 165 + // Run benchmark 166 + fmt.Println("Starting benchmark...") 167 + fmt.Println() 168 + 169 + var hashCount atomic.Int64 170 + var errorCount atomic.Int64 171 + startTime := time.Now() 172 + stopTime := startTime.Add(duration) 173 + 174 + var wg sync.WaitGroup 175 + for w := 0; w < numWorkers; w++ { 176 + wg.Add(1) 177 + go func(workerID int) { 178 + defer wg.Done() 179 + 180 + idx := 0 181 + for time.Now().Before(stopTime) { 182 + var err error 183 + 184 + if withIO { 185 + // Hash from file 186 + imagePath := imagePaths[idx%len(imagePaths)] 187 + _, err = pdq.HashFromFile(imagePath) 188 + } else { 189 + // Hash from pre-decoded image 190 + imgWithSize := preloadedImages[idx%len(preloadedImages)] 191 + _, err = pdq.HashFromImage(imgWithSize.img) 192 + } 193 + 194 + if err != nil { 195 + errorCount.Add(1) 196 + } else { 197 + hashCount.Add(1) 198 + } 199 + idx++ 200 + } 201 + }(w) 202 + } 203 + 204 + // Progress reporting 205 + go func() { 206 + ticker := time.NewTicker(time.Second) 207 + defer ticker.Stop() 208 + 209 + for time.Now().Before(stopTime) { 210 + <-ticker.C 211 + elapsed := time.Since(startTime) 212 + count := hashCount.Load() 213 + rate := float64(count) / elapsed.Seconds() 214 + fmt.Printf("\rElapsed: %5.1fs | Hashes: %7d | Rate: %8.1f hashes/sec", elapsed.Seconds(), count, rate) 215 + } 216 + }() 217 + 218 + // Wait for workers 219 + wg.Wait() 220 + 221 + elapsed := time.Since(startTime) 222 + totalHashes := hashCount.Load() 223 + totalErrors := errorCount.Load() 224 + hashesPerSecond := float64(totalHashes) / elapsed.Seconds() 225 + 226 + // Final results 227 + fmt.Printf("\n\n") 228 + fmt.Printf("Results\n") 229 + fmt.Printf("=======\n\n") 230 + fmt.Printf("Total Time: %v\n", elapsed) 231 + fmt.Printf("Total Hashes: %d\n", totalHashes) 232 + fmt.Printf("Errors: %d\n", totalErrors) 233 + fmt.Printf("\n") 234 + fmt.Printf("Throughput: %.1f hashes/sec\n", hashesPerSecond) 235 
+ fmt.Printf("Avg Time/Hash: %.2f ms\n", 1000.0/hashesPerSecond) 236 + fmt.Printf("\n") 237 + 238 + // Per-worker stats 239 + hashesPerWorker := float64(totalHashes) / float64(numWorkers) 240 + fmt.Printf("Per Worker: %.1f hashes\n", hashesPerWorker) 241 + fmt.Printf("Per Worker/Sec: %.1f hashes/sec\n", hashesPerWorker/elapsed.Seconds()) 242 + 243 + // Size breakdown if we have preloaded images 244 + if !withIO && len(preloadedImages) > 0 { 245 + fmt.Printf("\n") 246 + fmt.Printf("Image Size Breakdown (Original Sizes)\n") 247 + fmt.Printf("======================================\n\n") 248 + 249 + var small, medium, large int 250 + for _, imgWithSize := range preloadedImages { 251 + maxDim := imgWithSize.originalSize 252 + if maxDim <= 512 { 253 + small++ 254 + } else if maxDim <= 1024 { 255 + medium++ 256 + } else { 257 + large++ 258 + } 259 + } 260 + 261 + fmt.Printf("Small (≤512): %d (%.1f%%)\n", small, float64(small)/float64(len(preloadedImages))*100) 262 + fmt.Printf("Medium (513-1024): %d (%.1f%%)\n", medium, float64(medium)/float64(len(preloadedImages))*100) 263 + fmt.Printf("Large (>1024): %d (%.1f%%)\n", large, float64(large)/float64(len(preloadedImages))*100) 264 + } 265 + 266 + return nil 267 + } 268 + 269 + func loadImagePaths(dir string) ([]string, error) { 270 + var images []string 271 + err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { 272 + if err != nil { 273 + return nil 274 + } 275 + if info.IsDir() { 276 + return nil 277 + } 278 + 279 + ext := filepath.Ext(path) 280 + switch ext { 281 + case ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp": 282 + images = append(images, path) 283 + } 284 + return nil 285 + }) 286 + 287 + return images, err 288 + } 289 + 290 + func getCPUInfo() string { 291 + // Try to read CPU info from /proc/cpuinfo (Linux) 292 + if data, err := os.ReadFile("/proc/cpuinfo"); err == nil { 293 + lines := strings.Split(string(data), "\n") 294 + for _, line := range lines { 295 + if 
strings.HasPrefix(line, "model name") { 296 + parts := strings.SplitN(line, ":", 2) 297 + if len(parts) == 2 { 298 + return strings.TrimSpace(parts[1]) 299 + } 300 + } 301 + } 302 + } 303 + 304 + // Fallback to architecture info 305 + return fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH) 306 + }
+47
cmd/helper/main.go
··· 1 + package main 2 + 3 + import ( 4 + "context" 5 + "fmt" 6 + "log" 7 + "os" 8 + 9 + "github.com/haileyok/gopdq/helpers" 10 + "github.com/urfave/cli/v3" 11 + ) 12 + 13 + func main() { 14 + app := cli.Command{ 15 + Name: "pdq-helper", 16 + Commands: []*cli.Command{ 17 + { 18 + Name: "hamming", 19 + Arguments: []cli.Argument{ 20 + &cli.StringArg{ 21 + Name: "hash-1", 22 + }, 23 + &cli.StringArg{ 24 + Name: "hash-2", 25 + }, 26 + }, 27 + Action: func(ctx context.Context, cmd *cli.Command) error { 28 + hashOne := cmd.StringArg("hash-1") 29 + hashTwo := cmd.StringArg("hash-2") 30 + 31 + distance, err := helpers.HammingDistance(hashOne, hashTwo) 32 + if err != nil { 33 + return fmt.Errorf("failed to get distance between two input hashes: %w", err) 34 + } 35 + 36 + fmt.Println(distance) 37 + 38 + return nil 39 + }, 40 + }, 41 + }, 42 + } 43 + 44 + if err := app.Run(context.Background(), os.Args); err != nil { 45 + log.Fatal(err) 46 + } 47 + }
+63
cmd/pdqhasher/main.go
··· 1 + package main 2 + 3 + import ( 4 + "context" 5 + "fmt" 6 + "image" 7 + "log" 8 + "os" 9 + "time" 10 + 11 + pdq "github.com/haileyok/gopdq" 12 + "github.com/haileyok/gopdq/helpers" 13 + "github.com/urfave/cli/v3" 14 + ) 15 + 16 + func main() { 17 + app := cli.Command{ 18 + Name: "pdqhasher", 19 + Arguments: []cli.Argument{ 20 + &cli.StringArg{ 21 + Name: "input-file", 22 + UsageText: "path to input file to get a hash from", 23 + }, 24 + }, 25 + Action: func(ctx context.Context, cmd *cli.Command) error { 26 + fileName := cmd.StringArg("input-file") 27 + 28 + readStart := time.Now() 29 + file, err := os.Open(fileName) 30 + if err != nil { 31 + return fmt.Errorf("failed to open file at %s: %w", fileName, err) 32 + } 33 + defer file.Close() 34 + 35 + // decode the image so we can resize if needed 36 + img, _, err := image.Decode(file) 37 + 38 + readDuration := time.Since(readStart) 39 + 40 + resizeStart := time.Now() 41 + // resize the image if needed, since the implementation does not do resizing for you 42 + img = helpers.ResizeIfNeeded(img) 43 + resizeDuration := time.Since(resizeStart) 44 + 45 + // create a hash from the image 46 + hash, err := pdq.HashFromImage(img) 47 + if err != nil { 48 + return fmt.Errorf("failed to hash input image: %w", err) 49 + } 50 + 51 + binary, _ := helpers.PdqHashToBinary(hash.Hash) 52 + 53 + // return the hash and the quality 54 + fmt.Printf("\nHash: %s\nQuality: %d\nBinary: %s\n\nRead Microseconds: %d\nResize Microseconds: %d\nHash Microseconds: %d\n", hash.Hash, hash.Quality, binary, readDuration.Microseconds(), resizeDuration.Microseconds(), hash.HashDuration.Microseconds()) 55 + 56 + return nil 57 + }, 58 + } 59 + 60 + if err := app.Run(context.Background(), os.Args); err != nil { 61 + log.Fatal(err) 62 + } 63 + }
+90
downscaling.go
// computeJaroszFilterWindowSize returns oldDimension / (2*newDimension),
// rounded up for positive inputs, using integer arithmetic only.
func computeJaroszFilterWindowSize(oldDimension, newDimension int) int {
	divisor := 2 * newDimension
	return (oldDimension + divisor - 1) / divisor
}

// jaroszFilterFloat applies nreps rounds of a row-wise box filter followed by
// a column-wise box filter. Each round reads buffer1, writes the row pass into
// buffer2 and the column pass back into buffer1, so the final result is left
// in buffer1. Both buffers are numRows*numCols row-major matrices.
func jaroszFilterFloat(buffer1 []float32, buffer2 []float32, numRows, numCols, windowSizeAlongRows, windowSizeAlongCols, nreps int) {
	for pass := 0; pass < nreps; pass++ {
		boxAlongRowsFloat(buffer1, buffer2, numRows, numCols, windowSizeAlongRows)
		boxAlongColsFloat(buffer2, buffer1, numRows, numCols, windowSizeAlongCols)
	}
}

// boxAlongRowsFloat box-filters every row of the row-major matrix inVector
// into outVector (elements within a row are contiguous, so the stride is 1).
func boxAlongRowsFloat(inVector []float32, outVector []float32, numRows, numCols, windowSize int) {
	rowStart := 0
	for row := 0; row < numRows; row++ {
		box1DFloat(inVector[rowStart:], outVector[rowStart:], numCols, 1, windowSize)
		rowStart += numCols
	}
}

// boxAlongColsFloat box-filters every column of the row-major matrix inVector
// into outVector (consecutive elements of a column are numCols apart).
func boxAlongColsFloat(inVector []float32, outVector []float32, numRows, numCols, windowSize int) {
	for col := 0; col < numCols; col++ {
		box1DFloat(inVector[col:], outVector[col:], numRows, numCols, windowSize)
	}
}

// box1DFloat writes a running-mean (box) filter of a strided 1-D vector into
// outVector, using the same stride for input and output.
//
// The window grows at the start of the vector, slides at full width through
// the middle, and shrinks at the end; edge outputs are divided by the number
// of samples actually inside the window.
func box1DFloat(inVector []float32, outVector []float32, vectorLength, stride, fullWindowSize int) {
	halfWindowSize := (fullWindowSize + 2) / 2

	var (
		lead  int     // index of the next sample entering the window
		trail int     // index of the next sample leaving the window
		dst   int     // index of the next output element
		acc   float32 // sum of the samples currently inside the window
		width int     // number of samples currently inside the window
	)

	// Phase 1: accumulate the initial samples without emitting output.
	for n := halfWindowSize - 1; n > 0; n-- {
		acc += inVector[lead]
		width++
		lead += stride
	}

	// Phase 2: keep growing the window, emitting one output per new sample.
	for n := fullWindowSize - halfWindowSize + 1; n > 0; n-- {
		acc += inVector[lead]
		width++
		outVector[dst] = acc / float32(width)
		lead += stride
		dst += stride
	}

	// Phase 3: full-width window slides; the width is constant, so multiply
	// by its reciprocal instead of dividing.
	scale := 1.0 / float32(width)
	for n := vectorLength - fullWindowSize; n > 0; n-- {
		acc += inVector[lead]
		acc -= inVector[trail]
		outVector[dst] = acc * scale
		trail += stride
		lead += stride
		dst += stride
	}

	// Phase 4: input exhausted; shrink the window from the trailing side.
	for n := halfWindowSize - 1; n > 0; n-- {
		acc -= inVector[trail]
		width--
		outVector[dst] = acc / float32(width)
		trail += stride
		dst += stride
	}
}

// decimateFloat downsamples the row-major matrix in (inNumRows x inNumCols)
// into out (outNumRows x outNumCols) by picking, for every output cell, the
// input sample under that cell's center.
func decimateFloat(in []float32, inNumRows, inNumCols int, out []float32, outNumRows, outNumCols int) {
	// target centers not corners:
	for r := 0; r < outNumRows; r++ {
		srcRow := int(((float32(r) + 0.5) * float32(inNumRows)) / float32(outNumRows))
		srcBase := srcRow * inNumCols
		dstBase := r * outNumCols
		for c := 0; c < outNumCols; c++ {
			srcCol := int(((float32(c) + 0.5) * float32(inNumCols)) / float32(outNumCols))
			out[dstBase+c] = in[srcBase+srcCol]
		}
	}
}
+23
go.mod
··· 1 + module github.com/haileyok/gopdq 2 + 3 + go 1.25.5 4 + 5 + require ( 6 + github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 7 + github.com/prometheus/client_golang v1.23.2 8 + github.com/urfave/cli/v3 v3.6.1 9 + golang.org/x/image v0.35.0 10 + ) 11 + 12 + require ( 13 + github.com/beorn7/perks v1.0.1 // indirect 14 + github.com/cespare/xxhash/v2 v2.3.0 // indirect 15 + github.com/kr/text v0.2.0 // indirect 16 + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 17 + github.com/prometheus/client_model v0.6.2 // indirect 18 + github.com/prometheus/common v0.66.1 // indirect 19 + github.com/prometheus/procfs v0.16.1 // indirect 20 + go.yaml.in/yaml/v2 v2.4.2 // indirect 21 + golang.org/x/sys v0.35.0 // indirect 22 + google.golang.org/protobuf v1.36.9 // indirect 23 + )
+52
go.sum
··· 1 + github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= 2 + github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 3 + github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 4 + github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 5 + github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 6 + github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 7 + github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 8 + github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 9 + github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= 10 + github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= 11 + github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= 12 + github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 13 + github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 14 + github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 15 + github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 16 + github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= 17 + github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= 18 + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 19 + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 20 + github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ= 21 + github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8= 22 + 
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 23 + github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 24 + github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= 25 + github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= 26 + github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= 27 + github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= 28 + github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= 29 + github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= 30 + github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= 31 + github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= 32 + github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= 33 + github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= 34 + github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= 35 + github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= 36 + github.com/urfave/cli/v3 v3.6.1 h1:j8Qq8NyUawj/7rTYdBGrxcH7A/j7/G8Q5LhWEW4G3Mo= 37 + github.com/urfave/cli/v3 v3.6.1/go.mod h1:ysVLtOEmg2tOy6PknnYVhDoouyC/6N42TMeoMzskhso= 38 + go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= 39 + go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= 40 + go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= 41 + go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= 42 + golang.org/x/image v0.35.0 h1:LKjiHdgMtO8z7Fh18nGY6KDcoEtVfsgLDPeLyguqb7I= 43 + golang.org/x/image v0.35.0/go.mod 
h1:MwPLTVgvxSASsxdLzKrl8BRFuyqMyGhLwmC+TO1Sybk= 44 + golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= 45 + golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 46 + google.golang.org/protobuf v1.36.9 h1:w2gp2mA27hUeUzj9Ex9FBjsBm40zfaDtEWow293U7Iw= 47 + google.golang.org/protobuf v1.36.9/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= 48 + gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 49 + gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 50 + gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 51 + gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 52 + gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+58
helpers/binary.go
// pdqHashByteLen is the size of a decoded PDQ hash: 256 bits.
const pdqHashByteLen = 32

// PdqHashToBinary converts a 64-character hexadecimal PDQ hash into its
// 256-character binary string representation ('0' and '1' characters, most
// significant bit of each byte first), useful for inserting into some vector
// stores.
//
// It returns an error if input is not valid hexadecimal or does not decode to
// exactly 32 bytes.
func PdqHashToBinary(input string) (string, error) {
	hashb, err := hex.DecodeString(input)
	if err != nil {
		return "", fmt.Errorf("invalid hash: %w", err)
	}
	if len(hashb) != pdqHashByteLen {
		return "", fmt.Errorf("hash has invalid length: expected %d bytes, got %d", pdqHashByteLen, len(hashb))
	}

	result := make([]byte, 0, len(hashb)*8)
	for _, b := range hashb {
		for shift := 7; shift >= 0; shift-- {
			// '0' + bit yields the ASCII character '0' or '1'.
			result = append(result, '0'+(b>>shift)&1)
		}
	}

	return string(result), nil
}

// HammingDistance calculates the Hamming distance between two PDQ hashes,
// i.e. the number of bit positions in which they differ. Both inputs must be
// 64-character hexadecimal strings. The result is between 0 (identical) and
// 256 (every bit differs).
//
// It returns an error if either input is not valid hexadecimal or does not
// decode to exactly 32 bytes.
func HammingDistance(hashOne, hashTwo string) (int, error) {
	bytes1, err := hex.DecodeString(hashOne)
	if err != nil {
		return 0, fmt.Errorf("invalid hash1: %w", err)
	}

	bytes2, err := hex.DecodeString(hashTwo)
	if err != nil {
		return 0, fmt.Errorf("invalid hash2: %w", err)
	}

	if len(bytes1) != pdqHashByteLen {
		return 0, fmt.Errorf("first hash has invalid length: expected 32 bytes, got %d", len(bytes1))
	}
	if len(bytes2) != pdqHashByteLen {
		return 0, fmt.Errorf("second hash has invalid length: expected 32 bytes, got %d", len(bytes2))
	}

	distance := 0
	for i, b := range bytes1 {
		distance += bits.OnesCount8(b ^ bytes2[i])
	}

	return distance, nil
}
+29
helpers/resize.go
··· 1 + package helpers 2 + 3 + import ( 4 + "image" 5 + _ "image/gif" 6 + _ "image/jpeg" 7 + _ "image/png" 8 + 9 + pdq "github.com/haileyok/gopdq" 10 + "github.com/nfnt/resize" 11 + _ "golang.org/x/image/bmp" 12 + _ "golang.org/x/image/tiff" 13 + _ "golang.org/x/image/webp" 14 + ) 15 + 16 + func ResizeIfNeeded(img image.Image) image.Image { 17 + size := img.Bounds().Size() 18 + 19 + if size.X > pdq.DownsampleDims || size.Y > pdq.DownsampleDims { 20 + // SEE: https://github.com/facebook/ThreatExchange/blob/main/pdq/cpp/io/pdqio.cpp#L103 21 + // we use NearestNeighbor here as the PDQ uses that algo as well (unspecified parameter 22 + // which defaults to nearest neighbor, see https://cimg.eu/reference/structcimg__library_1_1CImg.html) 23 + // even still, because the two libraries have different implementations, we'll still see 24 + // minor differences in output. that is expected. see "More on Downsampling" in hashing.pdf 25 + return resize.Resize(pdq.DownsampleDims, pdq.DownsampleDims, img, resize.NearestNeighbor) 26 + } 27 + 28 + return img 29 + }
+344
pdq.go
// Reimplementation of https://github.com/facebook/ThreatExchange/blob/main/pdq in Golang
//
// For reference, please see https://github.com/facebook/ThreatExchange/blob/main/hashing/hashing.pdf
//
// Function names are similar or the same as those in the reference C++ implementation, and
// any questions about implementation should reference that code.

package pdq

import (
	"errors"
	"fmt"
	"image"
	_ "image/gif"
	_ "image/jpeg"
	_ "image/png"
	"math"
	"os"
	"time"

	_ "golang.org/x/image/bmp"
	_ "golang.org/x/image/tiff"
	_ "golang.org/x/image/webp"
)

// HashResult contains the output of a PDQ hash operation.
type HashResult struct {
	// Hash is the hash as a string.
	Hash string
	// Quality is the quality score computed alongside the hash.
	Quality int
	// ImageHeightTimesWidth is the pixel count (height * width) of the hashed image.
	ImageHeightTimesWidth int
	// HashDuration is the time spent computing the hash from the luma buffer.
	HashDuration time.Duration
}

// Various constants pulled from the reference implementation.
const (
	// Weights applied to the red, green and blue channels when computing luma.
	LumaFromRCoeff = 0.299
	LumaFromGCoeff = 0.587
	LumaFromBCoeff = 0.114

	// PdqNumJaroszXYPasses is the number of row+column box-filter rounds.
	PdqNumJaroszXYPasses = 2

	// DownsampleDims is the recommended maximum image dimension before hashing.
	DownsampleDims = 512

	// MinHashableDim is the smallest image dimension that will be hashed.
	MinHashableDim = 5
)

var (
	// ErrInvalidFile indicates that an invalid input file name was supplied.
	ErrInvalidFile = errors.New("invalid input file name")
)

// dctMatrix64 is a 16x64 row-major matrix of DCT coefficients, filled in by init.
// SEE: https://github.com/facebook/ThreatExchange/blob/main/pdq/cpp/hashing/pdqhashing.cpp#L42
var dctMatrix64 []float32

// init precomputes dctMatrix64. Entry (i, j) is
// sqrt(2/64) * cos((pi/2/64) * (i+1) * (2j+1)), computed in float64 and then
// stored as float32.
func init() {
	const numRows = 16
	const numCols = 64

	matrixScaleFactor := math.Sqrt(2.0 / float64(numCols))

	dctMatrix64 = make([]float32, numRows*numCols)

	for i := range numRows {
		for j := range numCols {
			dctMatrix64[i*numCols+j] = float32(matrixScaleFactor * math.Cos((math.Pi/2.0/float64(numCols))*float64(i+1)*float64(2*j+1)))
		}
	}
}

// HashFromImage generates a PDQ hash from an image.Image.
// The image should ideally be pre-resized to 512x512 or smaller for performance reasons.
// SEE: https://github.com/facebook/ThreatExchange/blob/main/hashing/hashing.pdf, "More on Downsampling"
// Returns a HashResult containing the hash and a quality score between 0 and 100.
// Please reference the evaluation data for selecting a good quality score. From hashing.pdf:
// "Confident-match distances are up to the system designer, of course, but 30, 20, or less has been found to
// produce good results on evaluation data."
//
// HashDuration in the result covers only the hashing step; converting the
// image to luma is not included. The returned error is currently always nil.
func HashFromImage(img image.Image) (*HashResult, error) {
	bounds := img.Bounds()
	size := bounds.Size()

	imageHeightTimesWidth := size.Y * size.X

	luma, numRows, numCols := loadFloatLumaFromImage(img)

	// Scratch buffer of the same size as the luma buffer, used by the hashing step.
	fullBuffer2 := make([]float32, numRows*numCols)

	hashStart := time.Now()
	hash, quality := hash256FromFloatLuma(luma, fullBuffer2, numRows, numCols)
	hashTime := time.Since(hashStart)

	return &HashResult{
		Hash:                  hash,
		Quality:               quality,
		ImageHeightTimesWidth: imageHeightTimesWidth,
		HashDuration:          hashTime,
	}, nil
}

// HashFromFile opens the file at the given path, decodes it with image.Decode,
// and returns the result of HashFromImage. This is a convenience wrapper around
// HashFromImage that handles the IO and decoding for you.
// Ideally, you should call HashFromImage on your own with a 512x512 or smaller image that you have resized
// yourself. This function is provided only to match the reference implementation.
102 + func HashFromFile(filename string) (*HashResult, error) { 103 + if filename == "" { 104 + return nil, ErrInvalidFile 105 + } 106 + 107 + file, err := os.Open(filename) 108 + if err != nil { 109 + return nil, fmt.Errorf("failed to open file for hashing: %w", err) 110 + } 111 + defer file.Close() 112 + 113 + img, _, err := image.Decode(file) 114 + if err != nil { 115 + return nil, fmt.Errorf("failed to decode image: %w", err) 116 + } 117 + 118 + return HashFromImage(img) 119 + } 120 + 121 + func loadFloatLumaFromImage(img image.Image) ([]float32, int, int) { 122 + bounds := img.Bounds() 123 + numRows := bounds.Dy() 124 + numCols := bounds.Dx() 125 + luma := make([]float32, numRows*numCols) 126 + 127 + for row := range numRows { 128 + for col := range numCols { 129 + // purposefully discarding alpha 130 + r, g, b, _ := img.At(bounds.Min.X+col, bounds.Min.Y+row).RGBA() 131 + 132 + r8 := float32(r >> 8) 133 + g8 := float32(g >> 8) 134 + b8 := float32(b >> 8) 135 + 136 + luma[row*numCols+col] = LumaFromRCoeff*r8 + LumaFromGCoeff*g8 + LumaFromBCoeff*b8 137 + } 138 + } 139 + 140 + return luma, numRows, numCols 141 + } 142 + 143 + // SEE: https://github.com/facebook/ThreatExchange/blob/main/pdq/cpp/hashing/pdqhashing.cpp#L127 144 + func hash256FromFloatLuma( 145 + fullBuffer1 []float32, 146 + fullBuffer2 []float32, 147 + numRows, numCols int, 148 + ) (string, int) { 149 + // from reference impl, do not return a hash for images taht are too small 150 + if numRows < MinHashableDim || numCols < MinHashableDim { 151 + return "", 0 152 + } 153 + 154 + buffer64x64 := make([]float32, 64*64) 155 + buffer16x64 := make([]float32, 16*64) 156 + buffer16x16 := make([]float32, 16*16) 157 + 158 + quality := float256FromFloatLuma(fullBuffer1, fullBuffer2, numRows, numCols, buffer64x64, buffer16x64, buffer16x16) 159 + 160 + hash := convertBufferToHash(buffer16x16) 161 + 162 + return hash, quality 163 + } 164 + 165 + const hexChars = "0123456789abcdef" 166 + 167 + func 
convertBufferToHash(buffer16x16 []float32) string { 168 + median := torben(buffer16x16) 169 + 170 + words := make([]uint16, 16) 171 + 172 + for i := range 16 { 173 + for j := range 16 { 174 + if buffer16x16[i*16+j] > median { 175 + bitIndex := i*16 + j 176 + wordIndex := bitIndex / 16 177 + bitInWord := bitIndex % 16 178 + words[wordIndex] |= 1 << bitInWord 179 + } 180 + } 181 + } 182 + 183 + result := make([]byte, 64) 184 + for i := range 16 { 185 + word := words[15-i] 186 + offset := i * 4 187 + result[offset+0] = hexChars[word>>12] 188 + result[offset+1] = hexChars[(word>>8)&0xF] 189 + result[offset+2] = hexChars[(word>>4)&0xF] 190 + result[offset+3] = hexChars[word&0xF] 191 + } 192 + 193 + return string(result) 194 + } 195 + 196 + // SEE: https://github.com/facebook/ThreatExchange/blob/main/pdq/cpp/hashing/pdqhashing.cpp#L158 197 + func float256FromFloatLuma( 198 + fullBuffer1 []float32, 199 + fullBuffer2 []float32, 200 + numRows, numCols int, 201 + buffer64x64 []float32, 202 + buffer16x64 []float32, 203 + buffer16x16 []float32, 204 + ) int { 205 + if numRows == 64 && numCols == 64 { 206 + copy(buffer64x64, fullBuffer1) 207 + } else { 208 + windowSizeAlongRows := computeJaroszFilterWindowSize(numCols, 64) 209 + windowSizeAlongCols := computeJaroszFilterWindowSize(numRows, 64) 210 + 211 + jaroszFilterFloat(fullBuffer1, fullBuffer2, numRows, numCols, windowSizeAlongRows, windowSizeAlongCols, PdqNumJaroszXYPasses) 212 + 213 + decimateFloat(fullBuffer1, numRows, numCols, buffer64x64, 64, 64) 214 + } 215 + 216 + quality := imageDomainQualityMetric(buffer64x64) 217 + 218 + dct64To16(buffer64x64, buffer16x64, buffer16x16) 219 + 220 + return quality 221 + } 222 + 223 + // SEE: https://github.com/facebook/ThreatExchange/blob/main/pdq/cpp/hashing/pdqhashing.cpp#L318 224 + func imageDomainQualityMetric(buffer64x64 []float32) int { 225 + gradientSum := 0 226 + 227 + for i := range 63 { 228 + for j := range 64 { 229 + u := buffer64x64[i*64+j] 230 + v := 
buffer64x64[(i+1)*64+j] 231 + d := int(math.Abs(float64((u - v) * 100 / 255))) 232 + gradientSum += d 233 + } 234 + } 235 + 236 + for i := range 64 { 237 + for j := range 63 { 238 + u := buffer64x64[i*64+j] 239 + v := buffer64x64[i*64+j+1] 240 + d := int(math.Abs(float64((u - v) * 100 / 255))) 241 + gradientSum += d 242 + } 243 + } 244 + 245 + quality := min(gradientSum/90, 100) 246 + 247 + return quality 248 + } 249 + 250 + // SEE: https://github.com/facebook/ThreatExchange/blob/main/pdq/cpp/hashing/pdqhashing.cpp#L355 251 + func dct64To16(A []float32, T []float32, B []float32) { 252 + for i := range 16 { 253 + dctRow := dctMatrix64[i*64:] 254 + for j := range 64 { 255 + var sum0, sum1, sum2, sum3 float32 256 + 257 + for k := 0; k < 64; k += 4 { 258 + sum0 += dctRow[k] * A[k*64+j] 259 + sum1 += dctRow[k+1] * A[(k+1)*64+j] 260 + sum2 += dctRow[k+2] * A[(k+2)*64+j] 261 + sum3 += dctRow[k+3] * A[(k+3)*64+j] 262 + } 263 + 264 + T[i*64+j] = sum0 + sum1 + sum2 + sum3 265 + } 266 + } 267 + 268 + for i := range 16 { 269 + tRow := T[i*64:] 270 + for j := range 16 { 271 + dctRow := dctMatrix64[j*64:] 272 + var sum0, sum1, sum2, sum3 float32 273 + 274 + for k := 0; k < 64; k += 4 { 275 + sum0 += tRow[k] * dctRow[k] 276 + sum1 += tRow[k+1] * dctRow[k+1] 277 + sum2 += tRow[k+2] * dctRow[k+2] 278 + sum3 += tRow[k+3] * dctRow[k+3] 279 + } 280 + 281 + B[i*16+j] = sum0 + sum1 + sum2 + sum3 282 + } 283 + } 284 + } 285 + 286 + // SEE: https://github.com/facebook/ThreatExchange/blob/main/pdq/cpp/hashing/torben.cpp 287 + func torben(m []float32) float32 { 288 + n := len(m) 289 + if n == 0 { 290 + return 0 291 + } 292 + 293 + min, max := m[0], m[0] 294 + for i := 1; i < n; i++ { 295 + if m[i] < min { 296 + min = m[i] 297 + } 298 + if m[i] > max { 299 + max = m[i] 300 + } 301 + } 302 + 303 + var guess, maxltguess, mingtguess float32 304 + var less, greater, equal int 305 + 306 + for { 307 + guess = (min + max) / 2 308 + less, greater, equal = 0, 0, 0 309 + maxltguess = min 310 + 
mingtguess = max 311 + 312 + for i := range n { 313 + if m[i] < guess { 314 + less++ 315 + if m[i] > maxltguess { 316 + maxltguess = m[i] 317 + } 318 + } else if m[i] > guess { 319 + greater++ 320 + if m[i] < mingtguess { 321 + mingtguess = m[i] 322 + } 323 + } else { 324 + equal++ 325 + } 326 + } 327 + 328 + if less <= (n+1)/2 && greater <= (n+1)/2 { 329 + break 330 + } else if less > greater { 331 + max = maxltguess 332 + } else { 333 + min = mingtguess 334 + } 335 + } 336 + 337 + if less >= (n+1)/2 { 338 + return maxltguess 339 + } else if less+equal >= (n+1)/2 { 340 + return guess 341 + } else { 342 + return mingtguess 343 + } 344 + }
+46
setup_testdata.py
#!/usr/bin/env python3
"""Download sample images from Lorem Picsum into TESTDATA_DIR."""

import os
import sys
import urllib.request
from pathlib import Path
import time

TESTDATA_DIR = "testdata/images"

# Image dimensions, selected by ``i % 3`` for the i-th image.
_SIZES_BY_REMAINDER = {
    0: (400, 300),
    1: (800, 600),
    2: (1920, 1080),
}


def download_images(num_images=50):
    """Download test images of various sizes.

    Images are saved as ``test_image_<i>.jpg`` under TESTDATA_DIR, which is
    created if missing. A failed download is reported and skipped; it does
    not abort the run.

    Args:
        num_images: number of images to request (numbered from 1).
    """

    Path(TESTDATA_DIR).mkdir(parents=True, exist_ok=True)

    print(f"Downloading {num_images} images from Lorem Picsum...")
    print()

    failures = 0

    for i in range(1, num_images + 1):
        width, height = _SIZES_BY_REMAINDER[i % 3]

        url = f"https://picsum.photos/{width}/{height}?random={i}"
        output_path = os.path.join(TESTDATA_DIR, f"test_image_{i}.jpg")

        try:
            # Flush so the progress text is visible while the download blocks.
            print(
                f"Downloading image {i}/{num_images} ({width}x{height})...",
                end=" ",
                flush=True,
            )
            urllib.request.urlretrieve(url, output_path)
            # Terminate the progress line; without this every successful
            # download ends up on the same output line.
            print("done")
            time.sleep(0.1)

        except Exception as e:
            failures += 1
            print(f"Failed to download image: {e}")

    print()
    print("Setup complete!")
    if failures:
        print(f"{failures} of {num_images} downloads failed")
    print(f"Downloaded images to: {TESTDATA_DIR}")


if __name__ == "__main__":
    num_images = int(sys.argv[1]) if len(sys.argv) > 1 else 50
    download_images(num_images)