-- =====================================================
-- TENANT FILES - VERIFICATION & ANALYSIS QUERIES
-- =====================================================
-- Purpose: Verify data integrity and analyze file usage
-- Date: 2025-01-30
-- =====================================================

-- =====================================================
-- BASIC VERIFICATION QUERIES
-- =====================================================

-- Confirm tenant_files exists in the current schema and show its storage
-- metadata. An empty result set means the table was not found.
SELECT
    t.TABLE_NAME,
    t.ENGINE,
    t.TABLE_ROWS,
    t.AVG_ROW_LENGTH,
    t.DATA_LENGTH,
    t.CREATE_TIME
FROM information_schema.TABLES AS t
WHERE t.TABLE_SCHEMA = DATABASE()
  AND t.TABLE_NAME = 'tenant_files';

-- Column definitions of tenant_files
SHOW COLUMNS FROM tenant_files;

-- Indexes defined on tenant_files
SHOW INDEX FROM tenant_files;

-- List every constraint (name and type) declared on tenant_files
-- in the current schema.
SELECT
    tc.CONSTRAINT_NAME,
    tc.CONSTRAINT_TYPE
FROM information_schema.TABLE_CONSTRAINTS AS tc
WHERE tc.TABLE_SCHEMA = DATABASE()
  AND tc.TABLE_NAME = 'tenant_files';

-- =====================================================
-- DATA INTEGRITY CHECKS
-- =====================================================

-- NULL counts for the fields this script treats as required.
-- One output row per field; a non-zero null_count flags a problem.
SELECT 'tenant_id' AS field, COUNT(*) AS null_count
FROM tenant_files AS tf
WHERE tf.tenant_id IS NULL

UNION ALL

SELECT 'file_category' AS field, COUNT(*) AS null_count
FROM tenant_files AS tf
WHERE tf.file_category IS NULL

UNION ALL

SELECT 'original_filename' AS field, COUNT(*) AS null_count
FROM tenant_files AS tf
WHERE tf.original_filename IS NULL

UNION ALL

SELECT 'stored_filename' AS field, COUNT(*) AS null_count
FROM tenant_files AS tf
WHERE tf.stored_filename IS NULL

UNION ALL

SELECT 'file_path' AS field, COUNT(*) AS null_count
FROM tenant_files AS tf
WHERE tf.file_path IS NULL

UNION ALL

SELECT 'file_size' AS field, COUNT(*) AS null_count
FROM tenant_files AS tf
WHERE tf.file_size IS NULL;

-- Stored filenames that appear on more than one row
-- (deleted and non-deleted rows are both considered).
SELECT
    tf.stored_filename,
    COUNT(*) AS duplicate_count
FROM tenant_files AS tf
GROUP BY tf.stored_filename
HAVING COUNT(*) > 1;

-- Rows whose recorded size is zero or negative.
SELECT
    tf.id,
    tf.tenant_id,
    tf.original_filename,
    tf.file_size
FROM tenant_files AS tf
WHERE tf.file_size <= 0;

-- Soft-delete consistency: rows where the is_deleted flag and the
-- deleted_at timestamp disagree, i.e.
--   * flagged deleted but with no deletion timestamp, or
--   * not flagged deleted yet carrying a deletion timestamp.
SELECT
    tf.id,
    tf.tenant_id,
    tf.original_filename,
    tf.is_deleted,
    tf.deleted_at
FROM tenant_files AS tf
WHERE (tf.is_deleted = 1 AND tf.deleted_at IS NULL)
   OR (tf.is_deleted = 0 AND tf.deleted_at IS NOT NULL);

-- =====================================================
-- STORAGE ANALYSIS QUERIES
-- =====================================================

-- Per-tenant storage footprint of non-deleted files: file count, total size
-- in bytes / KB / MB / GB (1024-based), and min / max / average file size.
-- Largest consumers first.
SELECT
    tf.tenant_id,
    COUNT(*) AS total_files,
    SUM(tf.file_size) AS total_bytes,
    ROUND(SUM(tf.file_size) / 1024, 2) AS total_kb,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS total_mb,
    ROUND(SUM(tf.file_size) / 1024 / 1024 / 1024, 2) AS total_gb,
    MIN(tf.file_size) AS smallest_file,
    MAX(tf.file_size) AS largest_file,
    AVG(tf.file_size) AS avg_file_size
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
GROUP BY tf.tenant_id
ORDER BY SUM(tf.file_size) DESC;

-- Per-category storage of non-deleted files. `percentage` is the category's
-- share of the total bytes across all non-deleted files (scalar subquery).
SELECT
    tf.file_category,
    COUNT(*) AS file_count,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS total_mb,
    ROUND(AVG(tf.file_size) / 1024, 2) AS avg_kb,
    ROUND(
        (
            SUM(tf.file_size)
            / (
                SELECT SUM(all_files.file_size)
                FROM tenant_files AS all_files
                WHERE all_files.is_deleted = 0
            )
        ) * 100,
        2
    ) AS percentage
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
GROUP BY tf.file_category
ORDER BY total_mb DESC;

-- Non-deleted file count and size (MB) for each tenant / category pair,
-- listed tenant by tenant with the heaviest categories first.
SELECT
    tf.tenant_id,
    tf.file_category,
    COUNT(*) AS file_count,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS total_mb
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
GROUP BY
    tf.tenant_id,
    tf.file_category
ORDER BY
    tf.tenant_id,
    total_mb DESC;

-- The 10 largest non-deleted files, size shown in MB.
SELECT
    tf.tenant_id,
    tf.file_category,
    tf.original_filename,
    ROUND(tf.file_size / 1024 / 1024, 2) AS size_mb,
    tf.upload_date,
    tf.access_count
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
ORDER BY tf.file_size DESC
LIMIT 10;

-- The 10 smallest non-deleted files, size shown in KB.
SELECT
    tf.tenant_id,
    tf.file_category,
    tf.original_filename,
    ROUND(tf.file_size / 1024, 2) AS size_kb,
    tf.upload_date
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
ORDER BY tf.file_size ASC
LIMIT 10;

-- =====================================================
-- USAGE ANALYSIS QUERIES
-- =====================================================

-- The 20 non-deleted files with the highest access_count.
SELECT
    f.tenant_id,
    f.file_category,
    f.original_filename,
    f.access_count,
    f.last_accessed_at,
    ROUND(f.file_size / 1024 / 1024, 2) AS size_mb
FROM tenant_files AS f
WHERE f.is_deleted = 0
ORDER BY f.access_count DESC
LIMIT 20;

-- The 20 oldest non-deleted files whose access_count is 0,
-- with their age in days relative to NOW().
SELECT
    f.tenant_id,
    f.file_category,
    f.original_filename,
    f.access_count,
    f.upload_date,
    DATEDIFF(NOW(), f.upload_date) AS days_old
FROM tenant_files AS f
WHERE f.is_deleted = 0
  AND f.access_count = 0
ORDER BY f.upload_date ASC
LIMIT 20;

-- Daily upload volume of non-deleted files for the 30 most recent
-- calendar days that have at least one upload.
SELECT
    DATE(tf.upload_date) AS upload_day,
    COUNT(*) AS files_uploaded,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS total_mb
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
GROUP BY DATE(tf.upload_date)
ORDER BY upload_day DESC
LIMIT 30;

-- Non-deleted files uploaded since NOW() minus 7 days,
-- broken down by tenant and category, busiest pairs first.
SELECT
    tf.tenant_id,
    tf.file_category,
    COUNT(*) AS recent_files,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS total_mb
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
  AND tf.upload_date >= DATE_SUB(NOW(), INTERVAL 7 DAY)
GROUP BY
    tf.tenant_id,
    tf.file_category
ORDER BY recent_files DESC;

-- Upload activity per uploader (rows with a NULL uploaded_by are skipped):
-- number of non-deleted files, their size in MB, and first / last upload time.
SELECT
    tf.uploaded_by,
    COUNT(*) AS files_uploaded,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS total_mb,
    MIN(tf.upload_date) AS first_upload,
    MAX(tf.upload_date) AS last_upload
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
  AND tf.uploaded_by IS NOT NULL
GROUP BY tf.uploaded_by
ORDER BY files_uploaded DESC;

-- =====================================================
-- RELATIONSHIP ANALYSIS
-- =====================================================

-- Non-deleted files grouped by the type of entity they are attached to
-- (rows without a related_type are ignored).
SELECT
    tf.related_type,
    COUNT(*) AS file_count,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS total_mb
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
  AND tf.related_type IS NOT NULL
GROUP BY tf.related_type
ORDER BY file_count DESC;

-- The 20 individual related entities (related_type + related_id)
-- holding the most non-deleted files.
SELECT
    tf.related_type,
    tf.related_id,
    COUNT(*) AS file_count,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS total_mb
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
  AND tf.related_type IS NOT NULL
  AND tf.related_id IS NOT NULL
GROUP BY
    tf.related_type,
    tf.related_id
ORDER BY file_count DESC
LIMIT 20;

-- =====================================================
-- FILE TYPE ANALYSIS
-- =====================================================

-- Non-deleted files per MIME type: count, total MB and average KB,
-- most common types first.
SELECT
    tf.mime_type,
    COUNT(*) AS file_count,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS total_mb,
    ROUND(AVG(tf.file_size) / 1024, 2) AS avg_kb
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
GROUP BY tf.mime_type
ORDER BY file_count DESC;

-- Files by extension (derived from original filename).
-- The extension is the lower-cased text after the last '.' in
-- original_filename; names without any '.' are excluded.
-- The GROUP BY repeats the expression instead of using the `extension`
-- alias: MySQL resolves unqualified GROUP BY names against table columns
-- before select aliases, so a real column named `extension` would otherwise
-- silently take over the grouping.
-- Ties on file_count are broken by extension so the output order is stable.
SELECT
    LOWER(SUBSTRING_INDEX(tf.original_filename, '.', -1)) AS extension,
    COUNT(*) AS file_count,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS total_mb
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
  AND tf.original_filename LIKE '%.%'
GROUP BY LOWER(SUBSTRING_INDEX(tf.original_filename, '.', -1))
ORDER BY
    file_count DESC,
    extension ASC;

-- =====================================================
-- DELETED FILES ANALYSIS
-- =====================================================

-- Deleted files statistics: a single row summarising all soft-deleted files.
-- SUM over zero rows is NULL, so deleted_mb is coalesced to 0 to report
-- "nothing deleted" as 0 MB rather than NULL. first_deletion / last_deletion
-- stay NULL when there are no deleted rows (or no deleted_at values).
SELECT
    COUNT(*) AS deleted_files,
    COALESCE(ROUND(SUM(tf.file_size) / 1024 / 1024, 2), 0) AS deleted_mb,
    MIN(tf.deleted_at) AS first_deletion,
    MAX(tf.deleted_at) AS last_deletion
FROM tenant_files AS tf
WHERE tf.is_deleted = 1;

-- Soft-deleted file count and size (MB) per tenant,
-- tenants with the most deletions first.
SELECT
    tf.tenant_id,
    COUNT(*) AS deleted_files,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS deleted_mb
FROM tenant_files AS tf
WHERE tf.is_deleted = 1
GROUP BY tf.tenant_id
ORDER BY deleted_files DESC;

-- Files soft-deleted since NOW() minus 30 days, newest deletions first.
SELECT
    tf.tenant_id,
    tf.file_category,
    tf.original_filename,
    ROUND(tf.file_size / 1024 / 1024, 2) AS size_mb,
    tf.deleted_at,
    tf.deleted_by
FROM tenant_files AS tf
WHERE tf.is_deleted = 1
  AND tf.deleted_at >= DATE_SUB(NOW(), INTERVAL 30 DAY)
ORDER BY tf.deleted_at DESC;

-- =====================================================
-- ACCESS LOG ANALYSIS
-- =====================================================

-- Overall access-log totals: rows logged and distinct files, users, tenants.
SELECT
    COUNT(*) AS total_accesses,
    COUNT(DISTINCT al.file_id) AS unique_files_accessed,
    COUNT(DISTINCT al.user_id) AS unique_users,
    COUNT(DISTINCT al.tenant_id) AS unique_tenants
FROM tenant_file_access_log AS al;

-- Access-log breakdown per access_type.
SELECT
    al.access_type,
    COUNT(*) AS access_count,
    COUNT(DISTINCT al.file_id) AS unique_files,
    COUNT(DISTINCT al.user_id) AS unique_users
FROM tenant_file_access_log AS al
GROUP BY al.access_type;

-- The 10 users with the most logged accesses,
-- with how many distinct files they touched and when.
SELECT
    al.user_id,
    COUNT(*) AS total_accesses,
    COUNT(DISTINCT al.file_id) AS files_accessed,
    MIN(al.accessed_at) AS first_access,
    MAX(al.accessed_at) AS last_access
FROM tenant_file_access_log AS al
GROUP BY al.user_id
ORDER BY total_accesses DESC
LIMIT 10;

-- Logged accesses bucketed by hour of day (HOUR(accessed_at)).
SELECT
    HOUR(al.accessed_at) AS hour,
    COUNT(*) AS access_count
FROM tenant_file_access_log AS al
GROUP BY HOUR(al.accessed_at)
ORDER BY hour;

-- =====================================================
-- PERFORMANCE CHECKS
-- =====================================================

-- Index metadata for tenant_files: one row per indexed column, in index
-- order, with the cardinality reported by information_schema.
SELECT
    s.TABLE_NAME,
    s.INDEX_NAME,
    s.SEQ_IN_INDEX,
    s.COLUMN_NAME,
    s.CARDINALITY,
    s.INDEX_TYPE
FROM information_schema.STATISTICS AS s
WHERE s.TABLE_SCHEMA = DATABASE()
  AND s.TABLE_NAME = 'tenant_files'
ORDER BY
    s.TABLE_NAME,
    s.INDEX_NAME,
    s.SEQ_IN_INDEX;

-- Size (data, index, combined; in MB) and reported row count of the
-- file-related tables in the current schema.
SELECT
    t.TABLE_NAME,
    t.ENGINE,
    t.TABLE_ROWS,
    t.AVG_ROW_LENGTH,
    ROUND((t.DATA_LENGTH + t.INDEX_LENGTH) / 1024 / 1024, 2) AS total_mb,
    ROUND(t.DATA_LENGTH / 1024 / 1024, 2) AS data_mb,
    ROUND(t.INDEX_LENGTH / 1024 / 1024, 2) AS index_mb
FROM information_schema.TABLES AS t
WHERE t.TABLE_SCHEMA = DATABASE()
  AND t.TABLE_NAME IN (
        'tenant_files',
        'tenant_file_access_log',
        'tenant_file_versions'
      );

-- =====================================================
-- MAINTENANCE QUERIES
-- =====================================================

-- Archival candidates: non-deleted files with no activity for more than
-- 365 days. "Activity" is last_accessed_at, falling back to upload_date
-- when the file has never been accessed (last_accessed_at IS NULL).
SELECT
    tf.tenant_id,
    tf.file_category,
    tf.original_filename,
    ROUND(tf.file_size / 1024 / 1024, 2) AS size_mb,
    tf.last_accessed_at,
    DATEDIFF(NOW(), COALESCE(tf.last_accessed_at, tf.upload_date)) AS days_inactive
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
  AND COALESCE(tf.last_accessed_at, tf.upload_date) < DATE_SUB(NOW(), INTERVAL 365 DAY)
ORDER BY days_inactive DESC;

-- Duplicate files (by hash): non-deleted files sharing the same file_hash.
--   file_ids            comma-separated ids in ascending order, so repeated
--                       runs produce the same string. GROUP_CONCAT output is
--                       cut off at the session's group_concat_max_len, so a
--                       very large group may show an incomplete list.
--   total_mb_duplicates combined size of ALL rows in the group (every copy,
--                       not just the redundant ones).
-- Ties on duplicate_count are broken by file_hash for a stable row order.
SELECT
    tf.file_hash,
    COUNT(*) AS duplicate_count,
    GROUP_CONCAT(tf.id ORDER BY tf.id ASC) AS file_ids,
    ROUND(SUM(tf.file_size) / 1024 / 1024, 2) AS total_mb_duplicates
FROM tenant_files AS tf
WHERE tf.file_hash IS NOT NULL
  AND tf.is_deleted = 0
GROUP BY tf.file_hash
HAVING COUNT(*) > 1
ORDER BY
    duplicate_count DESC,
    tf.file_hash ASC;

-- Compression candidates: non-deleted PDF / JPEG / PNG files strictly larger
-- than 10 MB (10 * 1024 * 1024 = 10485760 bytes), biggest first.
SELECT
    tf.tenant_id,
    tf.file_category,
    tf.original_filename,
    tf.mime_type,
    ROUND(tf.file_size / 1024 / 1024, 2) AS size_mb
FROM tenant_files AS tf
WHERE tf.is_deleted = 0
  AND tf.file_size > 10 * 1024 * 1024
  AND tf.mime_type IN (
        'application/pdf',
        'image/jpeg',
        'image/png'
      )
ORDER BY tf.file_size DESC;

-- =====================================================
-- SUMMARY DASHBOARD QUERY
-- =====================================================

-- Overall summary: a single row of headline figures.
--   total_active_files / total_deleted_files  rows with is_deleted = 0 / 1
--   total_gb            size of non-deleted files in GB (1024-based);
--                       coalesced to 0 because SUM over zero rows is NULL
--   active_tenants      distinct tenants owning at least one non-deleted file
--   total_accesses      row count of tenant_file_access_log
--   files_last_7_days / files_last_30_days
--                       rows uploaded since NOW() minus 7 / 30 days; these
--                       counts do not filter on is_deleted.
--                       NOTE(review): confirm soft-deleted uploads are meant
--                       to be included here.
SELECT
    (SELECT COUNT(*)
     FROM tenant_files
     WHERE is_deleted = 0) AS total_active_files,
    (SELECT COUNT(*)
     FROM tenant_files
     WHERE is_deleted = 1) AS total_deleted_files,
    (SELECT COALESCE(ROUND(SUM(file_size) / 1024 / 1024 / 1024, 2), 0)
     FROM tenant_files
     WHERE is_deleted = 0) AS total_gb,
    (SELECT COUNT(DISTINCT tenant_id)
     FROM tenant_files
     WHERE is_deleted = 0) AS active_tenants,
    (SELECT COUNT(*)
     FROM tenant_file_access_log) AS total_accesses,
    (SELECT COUNT(*)
     FROM tenant_files
     WHERE upload_date >= DATE_SUB(NOW(), INTERVAL 7 DAY)) AS files_last_7_days,
    (SELECT COUNT(*)
     FROM tenant_files
     WHERE upload_date >= DATE_SUB(NOW(), INTERVAL 30 DAY)) AS files_last_30_days;

-- =====================================================
-- END OF VERIFICATION QUERIES
-- =====================================================

-- Completion markers emitted at the end of the script. They are constant
-- strings and do not inspect the results of any check above.
SELECT
    '=== VERIFICATION COMPLETE ===' AS status;

SELECT
    'All queries executed successfully' AS message;

