-- Initial comprehensive schema for Teal music platform
-- Based on services/cadet/sql/base.sql

-- Artists, identified by their `mbid` UUID.
CREATE TABLE artists (
    mbid       UUID,
    name       TEXT NOT NULL,
    play_count INT DEFAULT 0,
    PRIMARY KEY (mbid)
);

-- Releases are synonymous with 'albums'.
CREATE TABLE releases (
    mbid       UUID,
    name       TEXT NOT NULL,
    play_count INT DEFAULT 0,
    PRIMARY KEY (mbid)
);

-- Recordings are synonymous with 'tracks', BUT a track can be in multiple releases!
CREATE TABLE recordings (
    mbid       UUID,
    name       TEXT NOT NULL,
    play_count INT DEFAULT 0,
    PRIMARY KEY (mbid)
);

-- One row per play, keyed by `uri`. Only `track_name` and the did/rkey/cid
-- identifiers are mandatory; release and recording links are optional.
CREATE TABLE plays (
    uri                       TEXT PRIMARY KEY,
    did                       TEXT NOT NULL,
    rkey                      TEXT NOT NULL,
    cid                       TEXT NOT NULL,
    isrc                      TEXT,
    duration                  INT,
    track_name                TEXT NOT NULL,
    played_time               TIMESTAMPTZ,
    processed_time            TIMESTAMPTZ DEFAULT NOW(),
    release_mbid              UUID REFERENCES releases (mbid),
    release_name              TEXT,
    recording_mbid            UUID REFERENCES recordings (mbid),
    submission_client_agent   TEXT,
    music_service_base_domain TEXT,
    origin_url                TEXT
);

-- Lookup indexes for the foreign keys, the play timestamp and the listener.
CREATE INDEX idx_plays_release_mbid
    ON plays (release_mbid);
CREATE INDEX idx_plays_recording_mbid
    ON plays (recording_mbid);
CREATE INDEX idx_plays_played_time
    ON plays (played_time);
CREATE INDEX idx_plays_did
    ON plays (did);

-- Junction table linking a play to each credited artist.
CREATE TABLE play_to_artists (
    play_uri    TEXT REFERENCES plays (uri),
    artist_mbid UUID,
    artist_name TEXT, -- storing here for ease of use when joining
    PRIMARY KEY (play_uri, artist_mbid),
    FOREIGN KEY (artist_mbid) REFERENCES artists (mbid)
);

-- The primary key leads with play_uri; this index serves lookups by artist.
CREATE INDEX idx_play_to_artists_artist
    ON play_to_artists (artist_mbid);

-- Profiles table: one row per `did`; every other column is optional.
CREATE TABLE profiles (
    did                TEXT,
    handle             TEXT,
    display_name       TEXT,
    description        TEXT,
    description_facets JSONB,
    avatar             TEXT, -- IPLD of the image, bafy...
    banner             TEXT,
    created_at         TIMESTAMPTZ,
    PRIMARY KEY (did)
);

-- User featured items table. `did` is the primary key, so each did can have
-- at most one featured item.
CREATE TABLE featured_items (
    did  TEXT,
    mbid TEXT NOT NULL,
    type TEXT NOT NULL,
    PRIMARY KEY (did)
);

-- Statii table (status records): the raw record is stored as JSONB.
CREATE TABLE statii (
    uri        TEXT,
    did        TEXT NOT NULL,
    rkey       TEXT NOT NULL,
    cid        TEXT NOT NULL,
    record     JSONB NOT NULL,
    indexed_at TIMESTAMPTZ DEFAULT NOW(),
    PRIMARY KEY (uri)
);

CREATE INDEX idx_statii_did_rkey
    ON statii (did, rkey);

-- Materialized view for artists' play counts.
-- The LEFT JOINs keep artists that have no plays; they get play_count = 0.
CREATE MATERIALIZED VIEW mv_artist_play_counts AS
SELECT
    art.mbid      AS artist_mbid,
    art.name      AS artist_name,
    COUNT(pl.uri) AS play_count
FROM artists AS art
LEFT JOIN play_to_artists AS link
    ON link.artist_mbid = art.mbid
LEFT JOIN plays AS pl
    ON pl.uri = link.play_uri
GROUP BY
    art.mbid,
    art.name;

-- One row per artist.
CREATE UNIQUE INDEX idx_mv_artist_play_counts
    ON mv_artist_play_counts (artist_mbid);

-- Materialized view for releases' play counts (0 for releases never played).
CREATE MATERIALIZED VIEW mv_release_play_counts AS
SELECT
    rel.mbid      AS release_mbid,
    rel.name      AS release_name,
    COUNT(pl.uri) AS play_count
FROM releases AS rel
LEFT JOIN plays AS pl
    ON rel.mbid = pl.release_mbid
GROUP BY
    rel.mbid,
    rel.name;

-- One row per release.
CREATE UNIQUE INDEX idx_mv_release_play_counts
    ON mv_release_play_counts (release_mbid);

-- Materialized view for recordings' play counts (0 for recordings never played).
CREATE MATERIALIZED VIEW mv_recording_play_counts AS
SELECT
    track.mbid    AS recording_mbid,
    track.name    AS recording_name,
    COUNT(pl.uri) AS play_count
FROM recordings AS track
LEFT JOIN plays AS pl
    ON track.mbid = pl.recording_mbid
GROUP BY
    track.mbid,
    track.name;

-- One row per recording.
CREATE UNIQUE INDEX idx_mv_recording_play_counts
    ON mv_recording_play_counts (recording_mbid);

-- Global play count materialized view. The query aggregates without a
-- GROUP BY, so the view always holds exactly one row.
CREATE MATERIALIZED VIEW mv_global_play_count AS
SELECT
    COUNT(pl.uri)          AS total_plays,
    COUNT(DISTINCT pl.did) AS unique_listeners
FROM plays AS pl;

CREATE UNIQUE INDEX idx_mv_global_play_count
    ON mv_global_play_count (total_plays);

-- Top artists in the last 30 days.
-- The INNER JOINs drop artists with no plays in the window.
CREATE MATERIALIZED VIEW mv_top_artists_30days AS
SELECT
    art.mbid      AS artist_mbid,
    art.name      AS artist_name,
    COUNT(pl.uri) AS play_count
FROM plays AS pl
INNER JOIN play_to_artists AS link
    ON link.play_uri = pl.uri
INNER JOIN artists AS art
    ON art.mbid = link.artist_mbid
WHERE pl.played_time >= NOW() - INTERVAL '30 days'
GROUP BY
    art.mbid,
    art.name
ORDER BY COUNT(pl.uri) DESC;

-- Top releases in the last 30 days (releases with no plays in the window are omitted).
CREATE MATERIALIZED VIEW mv_top_releases_30days AS
SELECT
    rel.mbid      AS release_mbid,
    rel.name      AS release_name,
    COUNT(pl.uri) AS play_count
FROM plays AS pl
INNER JOIN releases AS rel
    ON rel.mbid = pl.release_mbid
WHERE pl.played_time >= NOW() - INTERVAL '30 days'
GROUP BY
    rel.mbid,
    rel.name
ORDER BY COUNT(pl.uri) DESC;

-- Top artists for user in the last 30 days.
-- Joining profiles restricts the result to plays whose did has a profile row.
CREATE MATERIALIZED VIEW mv_top_artists_for_user_30days AS
SELECT
    prof.did,
    art.mbid      AS artist_mbid,
    art.name      AS artist_name,
    COUNT(pl.uri) AS play_count
FROM plays AS pl
INNER JOIN profiles AS prof
    ON prof.did = pl.did
INNER JOIN play_to_artists AS link
    ON link.play_uri = pl.uri
INNER JOIN artists AS art
    ON art.mbid = link.artist_mbid
WHERE pl.played_time >= NOW() - INTERVAL '30 days'
GROUP BY
    prof.did,
    art.mbid,
    art.name
ORDER BY COUNT(pl.uri) DESC;

-- Top artists for user in the last 7 days.
-- Joining profiles restricts the result to plays whose did has a profile row.
CREATE MATERIALIZED VIEW mv_top_artists_for_user_7days AS
SELECT
    prof.did,
    art.mbid      AS artist_mbid,
    art.name      AS artist_name,
    COUNT(pl.uri) AS play_count
FROM plays AS pl
INNER JOIN profiles AS prof
    ON prof.did = pl.did
INNER JOIN play_to_artists AS link
    ON link.play_uri = pl.uri
INNER JOIN artists AS art
    ON art.mbid = link.artist_mbid
WHERE pl.played_time >= NOW() - INTERVAL '7 days'
GROUP BY
    prof.did,
    art.mbid,
    art.name
ORDER BY COUNT(pl.uri) DESC;

-- Top releases for user in the last 30 days.
-- Joining profiles restricts the result to plays whose did has a profile row.
CREATE MATERIALIZED VIEW mv_top_releases_for_user_30days AS
SELECT
    prof.did,
    rel.mbid      AS release_mbid,
    rel.name      AS release_name,
    COUNT(pl.uri) AS play_count
FROM plays AS pl
INNER JOIN profiles AS prof
    ON prof.did = pl.did
INNER JOIN releases AS rel
    ON rel.mbid = pl.release_mbid
WHERE pl.played_time >= NOW() - INTERVAL '30 days'
GROUP BY
    prof.did,
    rel.mbid,
    rel.name
ORDER BY COUNT(pl.uri) DESC;

-- Top releases for user in the last 7 days.
-- Joining profiles restricts the result to plays whose did has a profile row.
CREATE MATERIALIZED VIEW mv_top_releases_for_user_7days AS
SELECT
    prof.did,
    rel.mbid      AS release_mbid,
    rel.name      AS release_name,
    COUNT(pl.uri) AS play_count
FROM plays AS pl
INNER JOIN profiles AS prof
    ON prof.did = pl.did
INNER JOIN releases AS rel
    ON rel.mbid = pl.release_mbid
WHERE pl.played_time >= NOW() - INTERVAL '7 days'
GROUP BY
    prof.did,
    rel.mbid,
    rel.name
ORDER BY COUNT(pl.uri) DESC;
-- CAR import functionality tables
-- For handling AT Protocol CAR file imports and processing

-- Tracks uploaded CAR files that are queued for processing.
CREATE TABLE IF NOT EXISTS car_import_requests (
    import_id TEXT PRIMARY KEY,
    car_data_base64 TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending', -- pending, processing, completed, failed
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    processed_at TIMESTAMP WITH TIME ZONE,
    error_message TEXT,
    file_size_bytes INTEGER,
    block_count INTEGER,
    extracted_records_count INTEGER DEFAULT 0
);

-- IF NOT EXISTS keeps the indexes as re-runnable as the table above; a plain
-- CREATE INDEX would abort a second run with "relation already exists".
CREATE INDEX IF NOT EXISTS idx_car_import_requests_status
    ON car_import_requests (status);
CREATE INDEX IF NOT EXISTS idx_car_import_requests_created_at
    ON car_import_requests (created_at);

-- Tracks raw IPLD blocks extracted from CAR files.
CREATE TABLE IF NOT EXISTS car_blocks (
    cid TEXT PRIMARY KEY,
    import_id TEXT NOT NULL REFERENCES car_import_requests(import_id),
    block_data BYTEA NOT NULL,
    decoded_successfully BOOLEAN DEFAULT FALSE,
    collection_type TEXT, -- e.g., 'fm.teal.alpha.feed.play', 'commit', etc.
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- IF NOT EXISTS keeps the indexes as re-runnable as the table above.
CREATE INDEX IF NOT EXISTS idx_car_blocks_import_id
    ON car_blocks (import_id);
CREATE INDEX IF NOT EXISTS idx_car_blocks_collection_type
    ON car_blocks (collection_type);

-- Tracks records extracted from CAR imports that were successfully processed.
CREATE TABLE IF NOT EXISTS car_extracted_records (
    id SERIAL PRIMARY KEY,
    import_id TEXT NOT NULL REFERENCES car_import_requests(import_id),
    cid TEXT NOT NULL REFERENCES car_blocks(cid),
    collection_type TEXT NOT NULL,
    record_uri TEXT, -- AT URI if applicable (e.g., for play records)
    synthetic_did TEXT, -- DID assigned for CAR imports (e.g., 'car-import:123')
    rkey TEXT,
    extracted_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    processing_notes TEXT
);

-- IF NOT EXISTS keeps the indexes as re-runnable as the table above.
CREATE INDEX IF NOT EXISTS idx_car_extracted_records_import_id
    ON car_extracted_records (import_id);
CREATE INDEX IF NOT EXISTS idx_car_extracted_records_collection_type
    ON car_extracted_records (collection_type);
CREATE INDEX IF NOT EXISTS idx_car_extracted_records_record_uri
    ON car_extracted_records (record_uri);

-- Tracks import metadata and commit information as key/value pairs per import.
CREATE TABLE IF NOT EXISTS car_import_metadata (
    import_id TEXT NOT NULL REFERENCES car_import_requests(import_id),
    metadata_key TEXT NOT NULL,
    metadata_value JSONB NOT NULL,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    PRIMARY KEY (import_id, metadata_key)
);

-- The primary key leads with import_id; this index serves lookups by key alone.
-- IF NOT EXISTS keeps it as re-runnable as the table above.
CREATE INDEX IF NOT EXISTS idx_car_import_metadata_key
    ON car_import_metadata (metadata_key);
-- Migration to support artists without MusicBrainz IDs
-- This allows the system to comply with the Teal lexicon where only trackName is required

-- Add a field to the plays table to store raw artist names for records without MBIDs.
ALTER TABLE plays
    ADD COLUMN artist_names_raw JSONB;

-- Create a new artists table that doesn't require MBID as primary key.
-- `name_normalized` is a stored generated column: LOWER(TRIM(name)).
CREATE TABLE artists_extended (
    id              SERIAL,
    mbid            UUID, -- Optional MusicBrainz ID
    name            TEXT NOT NULL,
    name_normalized TEXT GENERATED ALWAYS AS (LOWER(TRIM(name))) STORED,
    play_count      INT DEFAULT 0,
    created_at      TIMESTAMPTZ DEFAULT NOW(),
    updated_at      TIMESTAMPTZ DEFAULT NOW(),
    PRIMARY KEY (id),
    UNIQUE (mbid)
);

-- Create indexes for efficient lookups.
-- NOTE(review): UNIQUE (mbid) already creates an index on mbid, so the partial
-- index below looks redundant -- confirm before relying on it.
CREATE INDEX idx_artists_extended_mbid
    ON artists_extended (mbid)
    WHERE mbid IS NOT NULL;
CREATE INDEX idx_artists_extended_name_normalized
    ON artists_extended (name_normalized);
-- Artists without an MBID must be unique by normalized name.
CREATE UNIQUE INDEX idx_artists_extended_name_unique
    ON artists_extended (name_normalized)
    WHERE mbid IS NULL;

-- Create a new junction table that can handle both MBID and non-MBID artists.
CREATE TABLE play_to_artists_extended (
    play_uri    TEXT NOT NULL,
    artist_id   INT NOT NULL,
    artist_name TEXT NOT NULL, -- Denormalized for performance
    PRIMARY KEY (play_uri, artist_id),
    FOREIGN KEY (play_uri) REFERENCES plays (uri),
    FOREIGN KEY (artist_id) REFERENCES artists_extended (id)
);

-- The primary key leads with play_uri; this index serves lookups by artist.
CREATE INDEX idx_play_to_artists_extended_artist
    ON play_to_artists_extended (artist_id);

-- Migrate existing data from old tables to new structure.
-- Step 1: copy every legacy artist; `id` is assigned by the SERIAL default.
INSERT INTO artists_extended (mbid, name, play_count)
SELECT
    mbid,
    name,
    play_count
FROM artists;

-- Step 2: re-point the play/artist links at the new surrogate ids.
-- play_to_artists.artist_name is nullable but the target column is NOT NULL,
-- so fall back to the artist's own (NOT NULL) name instead of aborting the
-- migration on the first NULL.
INSERT INTO play_to_artists_extended (play_uri, artist_id, artist_name)
SELECT
    pta.play_uri,
    ae.id,
    COALESCE(pta.artist_name, ae.name)
FROM play_to_artists AS pta
INNER JOIN artists_extended AS ae
    ON ae.mbid = pta.artist_mbid;

-- Update materialized views to use new structure.
-- Rebuilt on artists_extended; artists without plays keep play_count = 0.
DROP MATERIALIZED VIEW IF EXISTS mv_artist_play_counts;

CREATE MATERIALIZED VIEW mv_artist_play_counts AS
SELECT
    art.id        AS artist_id,
    art.mbid      AS artist_mbid,
    art.name      AS artist_name,
    COUNT(pl.uri) AS play_count
FROM artists_extended AS art
LEFT JOIN play_to_artists_extended AS link
    ON link.artist_id = art.id
LEFT JOIN plays AS pl
    ON pl.uri = link.play_uri
GROUP BY
    art.id,
    art.mbid,
    art.name;

-- One row per artist.
CREATE UNIQUE INDEX idx_mv_artist_play_counts_new
    ON mv_artist_play_counts (artist_id);

-- Update other materialized views that reference artists.
DROP MATERIALIZED VIEW IF EXISTS mv_top_artists_30days;

-- Top artists in the last 30 days, now based on artists_extended.
CREATE MATERIALIZED VIEW mv_top_artists_30days AS
SELECT
    art.id        AS artist_id,
    art.mbid      AS artist_mbid,
    art.name      AS artist_name,
    COUNT(pl.uri) AS play_count
FROM plays AS pl
INNER JOIN play_to_artists_extended AS link
    ON link.play_uri = pl.uri
INNER JOIN artists_extended AS art
    ON art.id = link.artist_id
WHERE pl.played_time >= NOW() - INTERVAL '30 days'
GROUP BY
    art.id,
    art.mbid,
    art.name
ORDER BY COUNT(pl.uri) DESC;

DROP MATERIALIZED VIEW IF EXISTS mv_top_artists_for_user_30days;

-- Per-user top artists over the last 30 days, now based on artists_extended.
-- Joining profiles restricts the result to plays whose did has a profile row.
CREATE MATERIALIZED VIEW mv_top_artists_for_user_30days AS
SELECT
    prof.did,
    art.id        AS artist_id,
    art.mbid      AS artist_mbid,
    art.name      AS artist_name,
    COUNT(pl.uri) AS play_count
FROM plays AS pl
INNER JOIN profiles AS prof
    ON prof.did = pl.did
INNER JOIN play_to_artists_extended AS link
    ON link.play_uri = pl.uri
INNER JOIN artists_extended AS art
    ON art.id = link.artist_id
WHERE pl.played_time >= NOW() - INTERVAL '30 days'
GROUP BY
    prof.did,
    art.id,
    art.mbid,
    art.name
ORDER BY COUNT(pl.uri) DESC;

DROP MATERIALIZED VIEW IF EXISTS mv_top_artists_for_user_7days;

-- Per-user top artists over the last 7 days, now based on artists_extended.
-- Joining profiles restricts the result to plays whose did has a profile row.
CREATE MATERIALIZED VIEW mv_top_artists_for_user_7days AS
SELECT
    prof.did,
    art.id        AS artist_id,
    art.mbid      AS artist_mbid,
    art.name      AS artist_name,
    COUNT(pl.uri) AS play_count
FROM plays AS pl
INNER JOIN profiles AS prof
    ON prof.did = pl.did
INNER JOIN play_to_artists_extended AS link
    ON link.play_uri = pl.uri
INNER JOIN artists_extended AS art
    ON art.id = link.artist_id
WHERE pl.played_time >= NOW() - INTERVAL '7 days'
GROUP BY
    prof.did,
    art.id,
    art.mbid,
    art.name
ORDER BY COUNT(pl.uri) DESC;

-- Catalog comments explaining the migration strategy.
COMMENT ON COLUMN plays.artist_names_raw
    IS 'Raw artist names as JSON array for plays without MusicBrainz data, used as fallback when artist relationships cannot be established.';
COMMENT ON TABLE artists_extended
    IS 'Extended artists table that supports both MusicBrainz and non-MusicBrainz artists. Uses serial ID as primary key with optional MBID.';
COMMENT ON TABLE play_to_artists_extended
    IS 'Junction table linking plays to artists using the new artists_extended table structure.';
-- Migration to support synthetic MBIDs for artists without MusicBrainz data
-- This ensures all artists have some form of ID while maintaining uniqueness

-- Enable UUID extension for v5 UUID generation.
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

-- Add a column to track MBID type (musicbrainz, synthetic, unknown) together
-- with the check constraint that restricts it to those values.
ALTER TABLE artists_extended
    ADD COLUMN mbid_type TEXT NOT NULL DEFAULT 'unknown',
    ADD CONSTRAINT chk_mbid_type
        CHECK (mbid_type IN ('musicbrainz', 'synthetic', 'unknown'));

-- Update existing records to set proper MBID type: any row that already has
-- an MBID is marked 'musicbrainz'.
UPDATE artists_extended
SET mbid_type = 'musicbrainz'
WHERE mbid IS NOT NULL;

-- Drop the unique constraint on name_normalized for null MBIDs since duplicates
-- will be handled differently.
DROP INDEX IF EXISTS idx_artists_extended_name_unique;

-- Add index for efficient querying by MBID type.
CREATE INDEX idx_artists_extended_mbid_type
    ON artists_extended (mbid_type);

-- Create a view to easily work with different artist types.
-- `mbid_info` is a human-readable description of where the MBID came from.
CREATE VIEW artists_with_type AS
SELECT
    ae.id,
    ae.mbid,
    ae.name,
    ae.mbid_type,
    ae.play_count,
    ae.created_at,
    ae.updated_at,
    CASE ae.mbid_type
        -- For synthetic MBIDs, show the source name used for generation.
        WHEN 'synthetic' THEN 'Generated from: ' || ae.name
        WHEN 'musicbrainz' THEN 'MusicBrainz: ' || ae.mbid::text
        ELSE 'No MBID available'
    END AS mbid_info
FROM artists_extended AS ae;

-- Update materialized views to include MBID type information.
DROP MATERIALIZED VIEW IF EXISTS mv_artist_play_counts;

-- Plays per artist (0 for artists without plays), now exposing mbid_type.
CREATE MATERIALIZED VIEW mv_artist_play_counts AS
SELECT
    art.id        AS artist_id,
    art.mbid      AS artist_mbid,
    art.name      AS artist_name,
    art.mbid_type,
    COUNT(pl.uri) AS play_count
FROM artists_extended AS art
LEFT JOIN play_to_artists_extended AS link
    ON link.artist_id = art.id
LEFT JOIN plays AS pl
    ON pl.uri = link.play_uri
GROUP BY
    art.id,
    art.mbid,
    art.name,
    art.mbid_type;

-- One row per artist.
CREATE UNIQUE INDEX idx_mv_artist_play_counts_with_type
    ON mv_artist_play_counts (artist_id);

-- Catalog comments explaining the synthetic MBID system.
COMMENT ON COLUMN artists_extended.mbid
    IS 'MusicBrainz ID (for musicbrainz type) or synthetic UUID (for synthetic type)';
COMMENT ON COLUMN artists_extended.mbid_type
    IS 'Type of MBID: musicbrainz (real), synthetic (generated), or unknown (legacy data)';
COMMENT ON VIEW artists_with_type
    IS 'View that provides human-readable information about artist MBID sources';

-- Add a function to generate synthetic MBIDs.
-- Returns the v5 UUID of `artist_name` under a fixed namespace, so the same
-- name always produces the same UUID. The name is hashed exactly as given
-- (no trimming or case folding).
CREATE OR REPLACE FUNCTION generate_synthetic_mbid(artist_name TEXT) RETURNS UUID AS $$
DECLARE
    dns_namespace CONSTANT UUID := '6ba7b810-9dad-11d1-80b4-00c04fd430c8'; -- DNS namespace
BEGIN
    RETURN uuid_generate_v5(dns_namespace, artist_name);
END;
$$ LANGUAGE plpgsql IMMUTABLE;

COMMENT ON FUNCTION generate_synthetic_mbid
    IS 'Generates a deterministic UUID v5 for artist names without MusicBrainz IDs';
-- Migration to add fuzzy text matching capabilities
-- This enables better artist name matching using trigram similarity

-- Enable pg_trgm extension for trigram similarity matching.
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- Create GIN trigram indexes on both the raw and the normalized artist name.
CREATE INDEX idx_artists_extended_name_trgm
    ON artists_extended
    USING gin (name gin_trgm_ops);
CREATE INDEX idx_artists_extended_name_normalized_trgm
    ON artists_extended
    USING gin (name_normalized gin_trgm_ops);

-- Create a function to calculate comprehensive artist similarity.
--
-- Parameters:
--   input_name, existing_name    artist names to compare
--   input_album, existing_album  optional album context (default NULL)
-- Returns:
--   The trigram similarity of the trimmed, lower-cased names, raised to at
--   least 0.95 when the names are equal after removing every character other
--   than ASCII letters, digits and whitespace. When both albums are given the
--   result is 0.8 * name score + 0.2 * album trigram similarity.
--   NULL when either name is NULL.
CREATE OR REPLACE FUNCTION calculate_artist_similarity(
    input_name TEXT,
    existing_name TEXT,
    input_album TEXT DEFAULT NULL,
    existing_album TEXT DEFAULT NULL
) RETURNS FLOAT AS $$
DECLARE
    input_key TEXT;
    existing_key TEXT;
    name_similarity FLOAT;
    album_similarity FLOAT;
BEGIN
    -- Calculate trigram similarity for artist names
    name_similarity := similarity(LOWER(TRIM(input_name)), LOWER(TRIM(existing_name)));

    input_key := LOWER(TRIM(regexp_replace(input_name, '[^a-zA-Z0-9\s]', '', 'g')));
    existing_key := LOWER(TRIM(regexp_replace(existing_name, '[^a-zA-Z0-9\s]', '', 'g')));

    -- Boost for exact matches after normalization. The keys must be non-empty:
    -- names without any ASCII letter or digit (e.g. '!!!', '+/-', or names in
    -- non-Latin scripts) all normalize to '' and would otherwise be treated as
    -- exact matches of one another.
    IF input_key <> '' AND input_key = existing_key THEN
        name_similarity := GREATEST(name_similarity, 0.95);
    END IF;

    -- Without album context on both sides the name score is the final score.
    IF input_album IS NULL OR existing_album IS NULL THEN
        RETURN name_similarity;
    END IF;

    -- Weight: 80% name, 20% album
    album_similarity := similarity(LOWER(TRIM(input_album)), LOWER(TRIM(existing_album)));
    RETURN (name_similarity * 0.8) + (album_similarity * 0.2);
END;
$$ LANGUAGE plpgsql IMMUTABLE;

-- Create a view for fuzzy artist matching with confidence scores.
-- Pairs every artist (ae1, the "query" side) with every other artist (ae2, the
-- "match" side) whose trimmed, lower-cased name has trigram similarity > 0.8,
-- keeping only pairs where ae1 is synthetic or ae2 is a MusicBrainz artist.
-- No DISTINCT is needed: both primary keys are selected and each (ae1, ae2)
-- pair is produced once, so rows cannot repeat.
CREATE VIEW fuzzy_artist_matches AS
SELECT
    ae1.id AS query_artist_id,
    ae1.name AS query_artist_name,
    ae1.mbid_type AS query_mbid_type,
    ae2.id AS match_artist_id,
    ae2.name AS match_artist_name,
    ae2.mbid AS match_mbid,
    ae2.mbid_type AS match_mbid_type,
    similarity(LOWER(TRIM(ae1.name)), LOWER(TRIM(ae2.name))) AS name_similarity,
    CASE
        WHEN ae2.mbid_type = 'musicbrainz' THEN 'upgrade_to_mb'
        -- NOTE(review): unreachable with the current WHERE clause. Once ae2 is
        -- known not to be 'musicbrainz', the filter forces ae1 to be
        -- 'synthetic'. Kept so the intent stays visible; confirm whether the
        -- filter should also admit (musicbrainz, synthetic) pairs.
        WHEN ae1.mbid_type = 'musicbrainz' AND ae2.mbid_type = 'synthetic' THEN 'consolidate_to_mb'
        ELSE 'merge_synthetic'
    END AS match_action
FROM artists_extended AS ae1
CROSS JOIN artists_extended AS ae2
WHERE ae1.id != ae2.id
    AND similarity(LOWER(TRIM(ae1.name)), LOWER(TRIM(ae2.name))) > 0.8
    AND (
        ae1.mbid_type = 'synthetic'
        OR ae2.mbid_type = 'musicbrainz'
    );

-- Add comments.
-- COMMENT ON EXTENSION requires owning the extension. When pg_trgm was
-- installed beforehand by another role, CREATE EXTENSION IF NOT EXISTS succeeds
-- silently but the comment would abort the migration, so it is best-effort.
DO $$
BEGIN
    COMMENT ON EXTENSION pg_trgm IS 'Trigram extension for fuzzy text matching';
EXCEPTION
    WHEN insufficient_privilege THEN
        RAISE NOTICE 'Skipping COMMENT ON EXTENSION pg_trgm: current role does not own the extension';
END
$$;

COMMENT ON INDEX idx_artists_extended_name_trgm
    IS 'GIN index for trigram similarity on artist names';
COMMENT ON FUNCTION calculate_artist_similarity
    IS 'Calculates similarity score between artists considering name and optional album context';
COMMENT ON VIEW fuzzy_artist_matches
    IS 'Shows potential artist matches with confidence scores and recommended actions';

-- Create a function to suggest artist consolidations.
--
-- Parameters:
--   min_similarity  minimum name similarity to report (default 0.9). The
--                   underlying view only contains pairs above 0.8, so lower
--                   values return nothing extra.
-- Returns one row per fuzzy_artist_matches pair whose action is
-- 'upgrade_to_mb', with the number of play links of each side, ordered by
-- similarity and then by the query artist's play links (both descending).
CREATE OR REPLACE FUNCTION suggest_artist_consolidations(min_similarity FLOAT DEFAULT 0.9)
RETURNS TABLE(
    action TEXT,
    synthetic_artist TEXT,
    target_artist TEXT,
    similarity_score FLOAT,
    synthetic_plays INTEGER,
    target_plays INTEGER
) AS $$
BEGIN
    -- RETURN QUERY maps columns by position and requires their types to match
    -- the declared result exactly. similarity() yields REAL, so it is cast to
    -- FLOAT (double precision) to avoid "structure of query does not match
    -- function result type". Internal aliases deliberately differ from the
    -- OUT column names so they cannot be confused with them.
    RETURN QUERY
    SELECT
        fam.match_action,
        fam.query_artist_name,
        fam.match_artist_name,
        fam.name_similarity::FLOAT,
        (
            SELECT COUNT(*)::INTEGER
            FROM play_to_artists_extended AS ptae
            WHERE ptae.artist_id = fam.query_artist_id
        ) AS query_play_total,
        (
            SELECT COUNT(*)::INTEGER
            FROM play_to_artists_extended AS ptae
            WHERE ptae.artist_id = fam.match_artist_id
        ) AS match_play_total
    FROM fuzzy_artist_matches AS fam
    WHERE fam.name_similarity >= min_similarity
        AND fam.match_action = 'upgrade_to_mb'
    ORDER BY fam.name_similarity DESC, query_play_total DESC;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION suggest_artist_consolidations
    IS 'Returns suggestions for consolidating synthetic artists with MusicBrainz artists based on similarity';
-- Migration to add discriminant fields for track and release variants
-- This enables proper handling of different versions while maintaining grouping capabilities

-- Add discriminant fields to plays table.
ALTER TABLE plays
    ADD COLUMN track_discriminant TEXT,
    ADD COLUMN release_discriminant TEXT;

-- Add discriminant field to releases table.
ALTER TABLE releases
    ADD COLUMN discriminant TEXT;

-- Add discriminant field to recordings table.
ALTER TABLE recordings
    ADD COLUMN discriminant TEXT;

-- Create indexes for efficient searching and filtering.
CREATE INDEX idx_plays_track_discriminant
    ON plays (track_discriminant);
CREATE INDEX idx_plays_release_discriminant
    ON plays (release_discriminant);
CREATE INDEX idx_releases_discriminant
    ON releases (discriminant);
CREATE INDEX idx_recordings_discriminant
    ON recordings (discriminant);

-- Create composite indexes for grouping by base name + discriminant.
CREATE INDEX idx_plays_track_name_discriminant
    ON plays (track_name, track_discriminant);
CREATE INDEX idx_plays_release_name_discriminant
    ON plays (release_name, release_discriminant);

-- Update materialized views to include discriminant information.
DROP MATERIALIZED VIEW IF EXISTS mv_release_play_counts;

-- Plays per release (0 for releases never played), now exposing the discriminant.
CREATE MATERIALIZED VIEW mv_release_play_counts AS
SELECT
    rel.mbid         AS release_mbid,
    rel.name         AS release_name,
    rel.discriminant AS release_discriminant,
    COUNT(pl.uri)    AS play_count
FROM releases AS rel
LEFT JOIN plays AS pl
    ON rel.mbid = pl.release_mbid
GROUP BY
    rel.mbid,
    rel.name,
    rel.discriminant;

-- One row per release.
CREATE UNIQUE INDEX idx_mv_release_play_counts_discriminant
    ON mv_release_play_counts (release_mbid);

DROP MATERIALIZED VIEW IF EXISTS mv_recording_play_counts;

-- Plays per recording (0 for recordings never played), now exposing the discriminant.
CREATE MATERIALIZED VIEW mv_recording_play_counts AS
SELECT
    track.mbid         AS recording_mbid,
    track.name         AS recording_name,
    track.discriminant AS recording_discriminant,
    COUNT(pl.uri)      AS play_count
FROM recordings AS track
LEFT JOIN plays AS pl
    ON track.mbid = pl.recording_mbid
GROUP BY
    track.mbid,
    track.name,
    track.discriminant;

-- One row per recording.
CREATE UNIQUE INDEX idx_mv_recording_play_counts_discriminant
    ON mv_recording_play_counts (recording_mbid);

-- Create views for analyzing track/release variants.
-- track_variants: one row per (track_name, track_discriminant) combination in
-- plays, with total plays, distinct listeners and distinct recording MBIDs.
CREATE VIEW track_variants AS
SELECT
    pl.track_name,
    pl.track_discriminant,
    COUNT(*) AS play_count,
    COUNT(DISTINCT pl.did) AS unique_listeners,
    COUNT(DISTINCT pl.recording_mbid) AS unique_recordings
FROM plays AS pl
WHERE pl.track_name IS NOT NULL
GROUP BY
    pl.track_name,
    pl.track_discriminant
ORDER BY
    pl.track_name,
    play_count DESC;

-- release_variants: one row per (release_name, release_discriminant)
-- combination in plays, with total plays, distinct listeners and distinct
-- release MBIDs. Plays without a release name are ignored.
CREATE VIEW release_variants AS
SELECT
    pl.release_name,
    pl.release_discriminant,
    COUNT(*) AS play_count,
    COUNT(DISTINCT pl.did) AS unique_listeners,
    COUNT(DISTINCT pl.release_mbid) AS unique_releases
FROM plays AS pl
WHERE pl.release_name IS NOT NULL
GROUP BY
    pl.release_name,
    pl.release_discriminant
ORDER BY
    pl.release_name,
    play_count DESC;

-- Create function to extract potential discriminants from existing names.
--
-- Returns the trimmed content of the first (...), [...] or {...} group in
-- `name_text` that contains one of the variant keywords, or NULL when there is
-- none (or when `name_text` is NULL). All parenthesised groups are tried before
-- bracketed ones, and those before braced ones.
--
-- Matching is case-insensitive, in line with get_base_name's 'gi' flags, so
-- "(Deluxe Edition)" is recognised as well as "(deluxe edition)". The text
-- after the keyword is matched with a negated class instead of `.*?`: the
-- pattern's first quantifier is greedy, which makes the whole expression
-- greedy, so `.*?` could run on to the last closing delimiter in the name.
CREATE OR REPLACE FUNCTION extract_discriminant(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    keywords CONSTANT TEXT :=
        'deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus';
    discriminant_patterns CONSTANT TEXT[] := ARRAY[
        '\(([^)]*(?:' || keywords || ')[^)]*)\)',
        '\[([^]]*(?:' || keywords || ')[^]]*)\]',
        '\{([^}]*(?:' || keywords || ')[^}]*)\}'
    ];
    pattern TEXT;
    match_result TEXT;
BEGIN
    -- Try each pattern to find discriminant information
    FOREACH pattern IN ARRAY discriminant_patterns
    LOOP
        -- regexp_match yields NULL on no match; subscripting NULL stays NULL.
        match_result := trim((regexp_match(name_text, pattern, 'i'))[1]);
        IF match_result <> '' THEN
            RETURN match_result;
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;

-- Create function to get base name without discriminant.
--
-- Removes every (...), [...] or {...} group that contains a variant keyword
-- (case-insensitive), then trims the result and collapses whitespace runs to
-- single spaces. Returns NULL for NULL input.
--
-- The text after the keyword is matched with a negated class instead of `.*?`:
-- the leading `\s*` makes the whole expression greedy, so `.*?` could extend to
-- the last closing delimiter and delete unrelated text between two groups.
CREATE OR REPLACE FUNCTION get_base_name(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    keywords CONSTANT TEXT :=
        'deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus';
    cleanup_patterns CONSTANT TEXT[] := ARRAY[
        '\s*\([^)]*(?:' || keywords || ')[^)]*\)\s*',
        '\s*\[[^]]*(?:' || keywords || ')[^]]*\]\s*',
        '\s*\{[^}]*(?:' || keywords || ')[^}]*\}\s*'
    ];
    pattern TEXT;
    result_text TEXT := name_text;
BEGIN
    -- Remove discriminant patterns to get base name. Each removed group is
    -- replaced by one space so the words on either side do not fuse.
    FOREACH pattern IN ARRAY cleanup_patterns
    LOOP
        result_text := regexp_replace(result_text, pattern, ' ', 'gi');
    END LOOP;

    -- Clean up extra whitespace
    RETURN regexp_replace(trim(result_text), '\s+', ' ', 'g');
END;
$$ LANGUAGE plpgsql IMMUTABLE;

-- Catalog comments explaining the discriminant system.
COMMENT ON COLUMN plays.track_discriminant
    IS 'Distinguishing information for track variants (e.g., "Acoustic Version", "Live at Wembley", "Radio Edit")';
COMMENT ON COLUMN plays.release_discriminant
    IS 'Distinguishing information for release variants (e.g., "Deluxe Edition", "Remastered", "2023 Remaster")';
COMMENT ON COLUMN releases.discriminant
    IS 'Distinguishing information for release variants to enable proper grouping';
COMMENT ON COLUMN recordings.discriminant
    IS 'Distinguishing information for recording variants to enable proper grouping';

COMMENT ON VIEW track_variants
    IS 'Shows all variants of tracks with their play counts and unique listeners';
COMMENT ON VIEW release_variants
    IS 'Shows all variants of releases with their play counts and unique listeners';

COMMENT ON FUNCTION extract_discriminant
    IS 'Extracts discriminant information from track/release names for migration purposes';
COMMENT ON FUNCTION get_base_name
    IS 'Returns the base name without discriminant information for grouping purposes';
-- Enhanced discriminant extraction with comprehensive edition/version patterns
-- This migration improves the auto-population of discriminants for better metadata handling

-- Drop existing functions to replace them with enhanced versions.
-- NOTE(review): both replacements keep the (name_text TEXT) RETURNS TEXT
-- signature, so CREATE OR REPLACE alone would probably suffice and would keep
-- existing grants and dependent objects -- confirm before removing the drops.
DROP FUNCTION IF EXISTS extract_discriminant(TEXT);
DROP FUNCTION IF EXISTS get_base_name(TEXT);

-- Enhanced function to extract discriminants with comprehensive patterns.
--
-- Returns the first discriminant found in `name_text`, with leading/trailing
-- non-word characters stripped, or NULL when the input is NULL/blank or nothing
-- matches. Patterns are tried in this order:
--   1. (...) groups, then [...] groups, then {...} groups, each checked for:
--      an edition/format/genre keyword; a 2- or 4-digit year followed by
--      remaster/edition/version/mix/cut/release; a volume/part/disc number;
--      a featuring/with/vs credit; a from/soundtrack/ost/score/theme note
--   2. a trailing "- ..." (hyphen, en dash or em dash) suffix
--   3. a trailing ": ..." suffix
--
-- Differences from the previous revision of this function:
--   * matching is case-insensitive, so "(Deluxe Edition)" is recognised and not
--     only "(deluxe edition)";
--   * bracketed groups end at their own closing delimiter (`[^)]*` rather than
--     `.*?`, which did not stop there because the expression as a whole is
--     greedy);
--   * the dash/colon year patterns capture the whole suffix ("2011 Mix") and
--     not only its digits;
--   * the keyword lists are defined once, without the duplicated 'radio' and
--     'alternative' entries.
-- NOTE(review): keywords match as substrings, so short ones ('ep', 'lp', 'cd',
-- 'pro', 'and', ...) also match inside longer words such as "(Keep)". That
-- behaviour is unchanged here.
CREATE OR REPLACE FUNCTION extract_discriminant(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    -- Edition/version keywords shared by every pattern family.
    kw_core CONSTANT TEXT :=
        'deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo'
        || '|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s'
        || '|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced'
        || '|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original'
        || '|alternate|alternative|unreleased|rare|exclusive';
    -- Adds media formats (used by the dash patterns).
    kw_media CONSTANT TEXT := kw_core || '|digital|vinyl|cd|dvd|blu-ray';
    -- Adds release types and genres (used inside brackets only).
    kw_full CONSTANT TEXT := kw_media
        || '|hdtv|web|retail|promo|single|ep|lp|maxi|mini|club|dance|house|techno|trance'
        || '|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie';
    -- A 2- or 4-digit year followed by a version word.
    year_tag CONSTANT TEXT := '(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)';
    -- What a bracketed group must contain, in priority order.
    enclosed_bodies CONSTANT TEXT[] := ARRAY[
        '(?:' || kw_full || ')',
        year_tag,
        '(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+',
        '(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+',
        '(?:from|soundtrack|ost|score|theme)'
    ];
    -- Opening and closing delimiter of each bracket family, in priority order.
    delimiter_pairs CONSTANT TEXT[] := ARRAY['()', '[]', '{}'];

    discriminant_patterns TEXT[] := ARRAY[]::TEXT[];
    delim TEXT;
    opener TEXT;
    closer TEXT;
    body TEXT;
    pattern TEXT;
    match_result TEXT;
BEGIN
    -- Return early if input is null or empty
    IF name_text IS NULL OR trim(name_text) = '' THEN
        RETURN NULL;
    END IF;

    -- Bracketed patterns, e.g. for '()':  \(([^)]*<body>[^)]*)\)
    FOREACH delim IN ARRAY delimiter_pairs
    LOOP
        opener := left(delim, 1);
        closer := right(delim, 1);
        FOREACH body IN ARRAY enclosed_bodies
        LOOP
            discriminant_patterns := array_append(
                discriminant_patterns,
                E'\\' || opener || '([^' || closer || ']*' || body
                    || '[^' || closer || ']*)' || E'\\' || closer
            );
        END LOOP;
    END LOOP;

    -- Dash/hyphen patterns (common for editions), then colon patterns (common
    -- for subtitles and versions); both are anchored to the end of the name.
    discriminant_patterns := discriminant_patterns || ARRAY[
        '[-–—]\s*([^-–—]*(?:' || kw_media || ').*?)$',
        '[-–—]\s*(' || year_tag || '.*?)$',
        ':\s*([^:]*(?:' || kw_core || ').*?)$',
        ':\s*(' || year_tag || '.*?)$'
    ];

    -- Try each pattern to find discriminant information
    FOREACH pattern IN ARRAY discriminant_patterns
    LOOP
        match_result := trim((regexp_match(name_text, pattern COLLATE "C", 'i'))[1]);
        IF match_result <> '' THEN
            -- Remove leading/trailing punctuation
            match_result := regexp_replace(match_result, '^[^\w]+|[^\w]+$', '', 'g');
            -- Ensure it's not just whitespace or empty after cleanup
            IF length(trim(match_result)) > 0 THEN
                RETURN match_result;
            END IF;
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- get_base_name(name_text): strip edition/version style qualifiers from a track
-- or release name so that variants of the same work can be grouped together.
--
-- Parameters:
--   name_text  the name to clean.
-- Returns:
--   name_text with every cleanup pattern replaced by a single space, whitespace
--   collapsed, and one trailing ",", ";", ":" or dash removed. NULL or blank
--   input is returned unchanged, and so is any name that would otherwise be
--   reduced to an empty string.
--
-- Matching is case-insensitive (regexp_replace flag 'i'). Keywords are matched
-- as plain substrings; the patterns contain no word-boundary constraints.
CREATE OR REPLACE FUNCTION get_base_name(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    cleanup_patterns TEXT[] := ARRAY[
        -- qualifiers wrapped in ( ... )
        '\s*\([^)]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?\)\s*',
        '\s*\([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?\)\s*',
        '\s*\([^)]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?\)\s*',
        '\s*\([^)]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?\)\s*',
        '\s*\([^)]*(?:from|soundtrack|ost|score|theme).*?\)\s*',

        -- qualifiers wrapped in [ ... ]
        '\s*\[[^]]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?\]\s*',
        '\s*\[[^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?\]\s*',
        '\s*\[[^]]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?\]\s*',
        '\s*\[[^]]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?\]\s*',
        '\s*\[[^]]*(?:from|soundtrack|ost|score|theme).*?\]\s*',

        -- qualifiers wrapped in { ... }
        '\s*\{[^}]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?\}\s*',
        '\s*\{[^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?\}\s*',
        '\s*\{[^}]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?\}\s*',
        '\s*\{[^}]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?\}\s*',
        '\s*\{[^}]*(?:from|soundtrack|ost|score|theme).*?\}\s*',

        -- trailing " - qualifier" suffix (hyphen, en dash or em dash)
        '\s*[-–—]\s*[^-–—]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray).*?$',
        '\s*[-–—]\s*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$',

        -- trailing ": qualifier" suffix
        '\s*:\s*[^:]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive).*?$',
        '\s*:\s*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$'
    ];

    pat TEXT;
    stripped TEXT;
BEGIN
    -- NULL and blank names are handed back untouched.
    IF name_text IS NULL OR btrim(name_text) = '' THEN
        RETURN name_text;
    END IF;

    -- Each pattern is applied to the output of the previous one; a removed
    -- qualifier leaves a single space so neighbouring words do not fuse.
    stripped := name_text;
    FOREACH pat IN ARRAY cleanup_patterns LOOP
        stripped := regexp_replace(stripped, pat, ' ', 'gi');
    END LOOP;

    -- Collapse whitespace runs, then drop one dangling separator at the end.
    stripped := regexp_replace(btrim(stripped), '\s+', ' ', 'g');
    stripped := btrim(regexp_replace(stripped, '[,;:\-–—]\s*$', '', 'g'));

    -- A name made up solely of qualifiers keeps its original text.
    RETURN COALESCE(NULLIF(stripped, ''), name_text);
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- extract_edition_discriminant(name_text): pull an edition / version / remaster
-- qualifier out of a name.
--
-- Parameters:
--   name_text  the name to inspect.
-- Returns:
--   The first capture group of the first pattern that matches, trimmed and with
--   leading/trailing non-word characters removed; NULL when the input is NULL
--   or blank, or when no pattern yields a non-empty result.
--
-- Patterns are tried strictly in array order. Matching here is case-sensitive:
-- no case-insensitive option is set on the patterns or on substring().
CREATE OR REPLACE FUNCTION extract_edition_discriminant(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    edition_patterns TEXT[] := ARRAY[
        -- "...edition..." in (), [], {}, after a dash, after a colon
        '\(([^)]*edition[^)]*)\)',
        '\[([^]]*edition[^]]*)\]',
        '\{([^}]*edition[^}]*)\}',
        '[-–—]\s*([^-–—]*edition[^-–—]*)$',
        ':\s*([^:]*edition[^:]*)$',

        -- "...version..."
        '\(([^)]*version[^)]*)\)',
        '\[([^]]*version[^]]*)\]',
        '\{([^}]*version[^}]*)\}',
        '[-–—]\s*([^-–—]*version[^-–—]*)$',
        ':\s*([^:]*version[^:]*)$',

        -- "...remaster..."
        '\(([^)]*remaster[^)]*)\)',
        '\[([^]]*remaster[^]]*)\]',
        '\{([^}]*remaster[^}]*)\}',
        '[-–—]\s*([^-–—]*remaster[^-–—]*)$',
        ':\s*([^:]*remaster[^:]*)$',

        -- two- or four-digit number followed by a release keyword, bracketed only
        '\(([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^)]*)\)',
        '\[([^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^]]*)\]',
        '\{([^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^}]*)\}'
    ];

    pat TEXT;
    captured TEXT;
BEGIN
    IF name_text IS NULL OR btrim(name_text) = '' THEN
        RETURN NULL;
    END IF;

    FOREACH pat IN ARRAY edition_patterns LOOP
        -- substring(... FROM regex) yields the first parenthesised group.
        captured := substring(name_text FROM pat COLLATE "C");
        CONTINUE WHEN captured IS NULL OR btrim(captured) = '';

        -- Peel punctuation/whitespace off both ends of the capture.
        captured := regexp_replace(btrim(captured), '^[^\w]+|[^\w]+$', '', 'g');
        IF btrim(captured) <> '' THEN
            RETURN captured;
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Backfill: fill in discriminant columns that are still NULL, and only where
-- extract_discriminant() actually finds something in the name.

-- recordings.discriminant from recordings.name
UPDATE recordings AS rec
SET discriminant = extract_discriminant(rec.name)
WHERE rec.discriminant IS NULL
  AND extract_discriminant(rec.name) IS NOT NULL;

-- releases.discriminant from releases.name
UPDATE releases AS rel
SET discriminant = extract_discriminant(rel.name)
WHERE rel.discriminant IS NULL
  AND extract_discriminant(rel.name) IS NOT NULL;

-- plays.track_discriminant from plays.track_name
UPDATE plays AS p
SET track_discriminant = extract_discriminant(p.track_name)
WHERE p.track_discriminant IS NULL
  AND extract_discriminant(p.track_name) IS NOT NULL;

-- plays.release_discriminant from plays.release_name
UPDATE plays AS p
SET release_discriminant = extract_discriminant(p.release_name)
WHERE p.release_discriminant IS NULL
  AND p.release_name IS NOT NULL
  AND extract_discriminant(p.release_name) IS NOT NULL;

-- Composite indexes covering (name, discriminant) lookups
CREATE INDEX IF NOT EXISTS idx_recordings_name_discriminant
    ON recordings USING btree (name, discriminant);
CREATE INDEX IF NOT EXISTS idx_releases_name_discriminant
    ON releases USING btree (name, discriminant);

-- Catalog descriptions for the helper functions
COMMENT ON FUNCTION extract_discriminant(TEXT)
    IS 'Enhanced discriminant extraction supporting comprehensive edition/version patterns including parentheses, brackets, braces, dashes, and colons';
COMMENT ON FUNCTION get_base_name(TEXT)
    IS 'Enhanced base name extraction removing comprehensive discriminant patterns to enable proper grouping';
COMMENT ON FUNCTION extract_edition_discriminant(TEXT)
    IS 'Specialized function for extracting edition and version discriminants with focused patterns';
-- discriminant_analysis: one row per named recording and per named release,
-- showing the stored discriminant next to what each helper function derives
-- from the name. Intended for inspecting extraction quality.
CREATE OR REPLACE VIEW discriminant_analysis AS
WITH named_entities AS (
    SELECT
        'recordings' AS table_name,
        rec.name,
        rec.discriminant
    FROM recordings AS rec
    WHERE rec.name IS NOT NULL

    UNION ALL

    SELECT
        'releases' AS table_name,
        rel.name,
        rel.discriminant
    FROM releases AS rel
    WHERE rel.name IS NOT NULL
)
SELECT
    ent.table_name,
    ent.name AS original_name,
    ent.discriminant,
    get_base_name(ent.name) AS base_name,
    extract_discriminant(ent.name) AS extracted_discriminant,
    extract_edition_discriminant(ent.name) AS edition_discriminant
FROM named_entities AS ent;

COMMENT ON VIEW discriminant_analysis
    IS 'Analysis view showing discriminant extraction results for quality assessment and debugging';
-- Re-populate the play-count materialized views now that the backfill above
-- has run. WITH DATA is the default and is spelled out for clarity.
REFRESH MATERIALIZED VIEW mv_release_play_counts WITH DATA;
REFRESH MATERIALIZED VIEW mv_recording_play_counts WITH DATA;
-- discriminant_stats: one summary row per entity type (recordings, releases).
--
-- Columns:
--   entity_type               'recordings' or 'releases'
--   total_count               number of rows in the table
--   with_discriminant         rows whose discriminant is already set
--   extractable_discriminant  rows with no discriminant stored but for which
--                             extract_discriminant(name) returns a value
--   discriminant_percentage   with_discriminant as a percentage of total_count,
--                             rounded to 2 decimals; NULL when the table is empty
--
-- An aggregate without GROUP BY still yields one row for an empty table, so the
-- divisor is wrapped in NULLIF to avoid a division-by-zero error in that case.
CREATE OR REPLACE VIEW discriminant_stats AS
SELECT
    'recordings' AS entity_type,
    COUNT(*) AS total_count,
    COUNT(discriminant) AS with_discriminant,
    COUNT(*) FILTER (
        WHERE discriminant IS NULL
          AND extract_discriminant(name) IS NOT NULL
    ) AS extractable_discriminant,
    ROUND(
        COUNT(discriminant) * 100.0 / NULLIF(COUNT(*), 0), 2
    ) AS discriminant_percentage
FROM recordings
UNION ALL
SELECT
    'releases' AS entity_type,
    COUNT(*) AS total_count,
    COUNT(discriminant) AS with_discriminant,
    COUNT(*) FILTER (
        WHERE discriminant IS NULL
          AND extract_discriminant(name) IS NOT NULL
    ) AS extractable_discriminant,
    ROUND(
        COUNT(discriminant) * 100.0 / NULLIF(COUNT(*), 0), 2
    ) AS discriminant_percentage
FROM releases;

COMMENT ON VIEW discriminant_stats IS 'Statistics showing discriminant usage and extraction potential across entity types';
···1--- Fix case sensitivity in discriminant extraction patterns
2--- This migration updates the discriminant extraction functions to properly handle case-insensitive matching
-- Remove the objects this migration redefines: the two analysis views first,
-- then the three helper functions they call. CASCADE also removes any other
-- object that depends on them.
-- NOTE(review): discriminant_stats is dropped here but is not recreated in the
-- part of this migration visible in review - confirm it is restored further
-- down, otherwise the view is lost.
DROP VIEW IF EXISTS
    discriminant_analysis,
    discriminant_stats
CASCADE;

DROP FUNCTION IF EXISTS
    extract_discriminant(TEXT),
    get_base_name(TEXT),
    extract_edition_discriminant(TEXT)
CASCADE;
-- extract_discriminant(name_text): extract the qualifier ("discriminant") that
-- tells one edition/version of a track or release apart from another, e.g.
-- 'Deluxe Edition' from 'Abbey Road (Deluxe Edition)'.
--
-- Parameters:
--   name_text  the name to inspect.
-- Returns:
--   The first capture group of the first matching pattern, trimmed and with
--   leading/trailing non-word characters removed; NULL when the input is NULL
--   or blank, or when no pattern yields a non-empty result.
--
-- Patterns are tried in array order: ( ), [ ], { }, then a trailing dash
-- suffix, then a trailing colon suffix. Matching is case-insensitive through
-- the embedded (?i) option. Keywords are matched as plain substrings; the
-- patterns contain no word-boundary constraints.
--
-- The bracketed patterns bound their tail with a negated class ([^)]* etc.)
-- instead of ".*?": each pattern starts with a greedy quantifier, which makes
-- the whole regular expression greedy in PostgreSQL, so ".*?" would extend the
-- match to the last closing bracket in the name rather than the nearest one.
CREATE OR REPLACE FUNCTION extract_discriminant(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    discriminant_patterns TEXT[] := ARRAY[
        -- Parentheses patterns
        '(?i)\(([^)]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative)[^)]*)\)',
        '(?i)\(([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^)]*)\)',
        '(?i)\(([^)]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+[^)]*)\)',
        '(?i)\(([^)]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+[^)]*)\)',
        '(?i)\(([^)]*(?:from|soundtrack|ost|score|theme)[^)]*)\)',

        -- Brackets patterns
        '(?i)\[([^]]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative)[^]]*)\]',
        '(?i)\[([^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^]]*)\]',
        '(?i)\[([^]]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+[^]]*)\]',
        '(?i)\[([^]]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+[^]]*)\]',
        '(?i)\[([^]]*(?:from|soundtrack|ost|score|theme)[^]]*)\]',

        -- Braces patterns
        '(?i)\{([^}]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative)[^}]*)\}',
        '(?i)\{([^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^}]*)\}',
        '(?i)\{([^}]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+[^}]*)\}',
        '(?i)\{([^}]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+[^}]*)\}',
        '(?i)\{([^}]*(?:from|soundtrack|ost|score|theme)[^}]*)\}',

        -- Dash/hyphen patterns (trailing suffix, runs to end of string)
        '(?i)[-–—]\s*([^-–—]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray).*?)$',
        -- The capture group spans number + keyword + rest, matching the
        -- bracketed year patterns, so 'Song - 2011 Mix' yields '2011 Mix'.
        '(?i)[-–—]\s*((?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?)$',

        -- Colon patterns (trailing suffix, runs to end of string)
        '(?i):\s*([^:]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive).*?)$',
        '(?i):\s*((?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?)$'
    ];

    pattern TEXT;
    match_result TEXT;
BEGIN
    -- Nothing to extract from NULL or blank input.
    IF name_text IS NULL OR trim(name_text) = '' THEN
        RETURN NULL;
    END IF;

    FOREACH pattern IN ARRAY discriminant_patterns
    LOOP
        -- substring(... FROM regex) returns the first parenthesised group.
        match_result := substring(name_text FROM pattern);
        IF match_result IS NOT NULL AND length(trim(match_result)) > 0 THEN
            -- Strip surrounding whitespace, then surrounding punctuation.
            match_result := regexp_replace(trim(match_result), '^[^\w]+|[^\w]+$', '', 'g');
            -- A capture that was only punctuation falls through to the next pattern.
            IF length(trim(match_result)) > 0 THEN
                RETURN match_result;
            END IF;
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- get_base_name(name_text): strip edition/version style qualifiers from a track
-- or release name so that variants of the same work share one base name, e.g.
-- 'Abbey Road (Deluxe Edition)' -> 'Abbey Road'.
--
-- Parameters:
--   name_text  the name to clean.
-- Returns:
--   name_text with every cleanup pattern replaced by a single space, whitespace
--   collapsed, and one trailing ",", ";", ":" or dash removed. NULL or blank
--   input is returned unchanged, and so is any name that would otherwise be
--   reduced to an empty string.
--
-- Matching is case-insensitive through the embedded (?i) option. Keywords are
-- matched as plain substrings; the patterns contain no word-boundary
-- constraints.
--
-- The bracketed patterns bound their tail with a negated class ([^)]* etc.)
-- instead of ".*?": every pattern starts with the greedy "\s*", which makes the
-- whole regular expression greedy in PostgreSQL, so ".*?" would swallow
-- everything up to the last closing bracket in the name
-- ('Song (Live) (Mono)' would collapse to 'Song').
CREATE OR REPLACE FUNCTION get_base_name(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    cleanup_patterns TEXT[] := ARRAY[
        -- Remove parentheses content
        '(?i)\s*\([^)]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative)[^)]*\)\s*',
        '(?i)\s*\([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^)]*\)\s*',
        '(?i)\s*\([^)]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+[^)]*\)\s*',
        '(?i)\s*\([^)]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+[^)]*\)\s*',
        '(?i)\s*\([^)]*(?:from|soundtrack|ost|score|theme)[^)]*\)\s*',

        -- Remove brackets content
        '(?i)\s*\[[^]]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative)[^]]*\]\s*',
        '(?i)\s*\[[^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^]]*\]\s*',
        '(?i)\s*\[[^]]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+[^]]*\]\s*',
        '(?i)\s*\[[^]]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+[^]]*\]\s*',
        '(?i)\s*\[[^]]*(?:from|soundtrack|ost|score|theme)[^]]*\]\s*',

        -- Remove braces content
        '(?i)\s*\{[^}]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative)[^}]*\}\s*',
        '(?i)\s*\{[^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^}]*\}\s*',
        '(?i)\s*\{[^}]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+[^}]*\}\s*',
        '(?i)\s*\{[^}]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+[^}]*\}\s*',
        '(?i)\s*\{[^}]*(?:from|soundtrack|ost|score|theme)[^}]*\}\s*',

        -- Remove dash/hyphen suffixes (anchored to end of string)
        '(?i)\s*[-–—]\s*[^-–—]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray).*?$',
        '(?i)\s*[-–—]\s*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$',

        -- Remove colon suffixes (anchored to end of string)
        '(?i)\s*:\s*[^:]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive).*?$',
        '(?i)\s*:\s*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$'
    ];

    pattern TEXT;
    result_text TEXT := name_text;
BEGIN
    -- NULL and blank names are handed back untouched.
    IF name_text IS NULL OR trim(name_text) = '' THEN
        RETURN name_text;
    END IF;

    -- Apply every pattern to the running result; each removed qualifier leaves
    -- a single space so neighbouring words do not fuse.
    FOREACH pattern IN ARRAY cleanup_patterns
    LOOP
        result_text := regexp_replace(result_text, pattern, ' ', 'g');
    END LOOP;

    -- Collapse whitespace runs left behind by the removals.
    result_text := regexp_replace(trim(result_text), '\s+', ' ', 'g');

    -- Drop one dangling separator at the end, if any.
    result_text := trim(regexp_replace(result_text, '[,;:\-–—]\s*$', '', 'g'));

    -- A name made up solely of qualifiers keeps its original text.
    IF result_text = '' THEN
        RETURN name_text;
    END IF;

    RETURN result_text;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- extract_edition_discriminant(name_text): pull an edition / version / remaster
-- qualifier out of a name, ignoring letter case.
--
-- Parameters:
--   name_text  the name to inspect.
-- Returns:
--   The first capture group of the first pattern that matches, trimmed and with
--   leading/trailing non-word characters removed; NULL when the input is NULL
--   or blank, or when no pattern yields a non-empty result.
--
-- Patterns are tried strictly in array order; each carries the embedded (?i)
-- option, which makes the match case-insensitive.
CREATE OR REPLACE FUNCTION extract_edition_discriminant(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    edition_patterns TEXT[] := ARRAY[
        -- "...edition..." in (), [], {}, after a dash, after a colon
        '(?i)\(([^)]*edition[^)]*)\)',
        '(?i)\[([^]]*edition[^]]*)\]',
        '(?i)\{([^}]*edition[^}]*)\}',
        '(?i)[-–—]\s*([^-–—]*edition[^-–—]*)$',
        '(?i):\s*([^:]*edition[^:]*)$',

        -- "...version..."
        '(?i)\(([^)]*version[^)]*)\)',
        '(?i)\[([^]]*version[^]]*)\]',
        '(?i)\{([^}]*version[^}]*)\}',
        '(?i)[-–—]\s*([^-–—]*version[^-–—]*)$',
        '(?i):\s*([^:]*version[^:]*)$',

        -- "...remaster..."
        '(?i)\(([^)]*remaster[^)]*)\)',
        '(?i)\[([^]]*remaster[^]]*)\]',
        '(?i)\{([^}]*remaster[^}]*)\}',
        '(?i)[-–—]\s*([^-–—]*remaster[^-–—]*)$',
        '(?i):\s*([^:]*remaster[^:]*)$',

        -- two- or four-digit number followed by a release keyword, bracketed only
        '(?i)\(([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^)]*)\)',
        '(?i)\[([^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^]]*)\]',
        '(?i)\{([^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^}]*)\}'
    ];

    pat TEXT;
    captured TEXT;
BEGIN
    IF name_text IS NULL OR btrim(name_text) = '' THEN
        RETURN NULL;
    END IF;

    FOREACH pat IN ARRAY edition_patterns LOOP
        -- substring(... FROM regex) yields the first parenthesised group.
        captured := substring(name_text FROM pat);
        CONTINUE WHEN captured IS NULL OR btrim(captured) = '';

        -- Peel punctuation/whitespace off both ends of the capture.
        captured := regexp_replace(btrim(captured), '^[^\w]+|[^\w]+$', '', 'g');
        IF btrim(captured) <> '' THEN
            RETURN captured;
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Second backfill pass using the case-insensitive extract_discriminant():
-- only rows whose discriminant column is still NULL are touched, and only when
-- the function finds something in the name.

-- recordings.discriminant from recordings.name
UPDATE recordings AS rec
SET discriminant = extract_discriminant(rec.name)
WHERE rec.discriminant IS NULL
  AND extract_discriminant(rec.name) IS NOT NULL;

-- releases.discriminant from releases.name
UPDATE releases AS rel
SET discriminant = extract_discriminant(rel.name)
WHERE rel.discriminant IS NULL
  AND extract_discriminant(rel.name) IS NOT NULL;

-- plays.track_discriminant from plays.track_name
UPDATE plays AS p
SET track_discriminant = extract_discriminant(p.track_name)
WHERE p.track_discriminant IS NULL
  AND extract_discriminant(p.track_name) IS NOT NULL;

-- plays.release_discriminant from plays.release_name
UPDATE plays AS p
SET release_discriminant = extract_discriminant(p.release_name)
WHERE p.release_discriminant IS NULL
  AND p.release_name IS NOT NULL
  AND extract_discriminant(p.release_name) IS NOT NULL;

-- Catalog descriptions for the redefined helper functions
COMMENT ON FUNCTION extract_discriminant(TEXT)
    IS 'Enhanced case-insensitive discriminant extraction supporting comprehensive edition/version patterns including parentheses, brackets, braces, dashes, and colons';
COMMENT ON FUNCTION get_base_name(TEXT)
    IS 'Enhanced case-insensitive base name extraction removing comprehensive discriminant patterns to enable proper grouping';
COMMENT ON FUNCTION extract_edition_discriminant(TEXT)
    IS 'Specialized case-insensitive function for extracting edition and version discriminants with focused patterns';

-- Re-populate the play-count materialized views after the backfill.
-- WITH DATA is the default and is spelled out for clarity.
REFRESH MATERIALIZED VIEW mv_release_play_counts WITH DATA;
REFRESH MATERIALIZED VIEW mv_recording_play_counts WITH DATA;
-- Rebuild discriminant_analysis on top of the case-insensitive functions:
-- one row per named recording and per named release, showing the stored
-- discriminant next to what each helper function derives from the name.
DROP VIEW IF EXISTS discriminant_analysis;

CREATE OR REPLACE VIEW discriminant_analysis AS
WITH named_entities AS (
    SELECT
        'recordings' AS table_name,
        rec.name,
        rec.discriminant
    FROM recordings AS rec
    WHERE rec.name IS NOT NULL

    UNION ALL

    SELECT
        'releases' AS table_name,
        rel.name,
        rel.discriminant
    FROM releases AS rel
    WHERE rel.name IS NOT NULL
)
SELECT
    ent.table_name,
    ent.name AS original_name,
    ent.discriminant,
    get_base_name(ent.name) AS base_name,
    extract_discriminant(ent.name) AS extracted_discriminant,
    extract_edition_discriminant(ent.name) AS edition_discriminant
FROM named_entities AS ent;

COMMENT ON VIEW discriminant_analysis
    IS 'Analysis view showing case-insensitive discriminant extraction results for quality assessment and debugging';