Query Details

Column Size Analysis

Query

// Column Size Analysis - Identify Oversized Columns per Table
// Analyzes each of the top ingesting tables individually, pivots columns
// with narrow(), and finds columns consuming >30% of average row size.
// These are prime candidates for project-away in DCR transforms.
// =====================================================================

let _SampleSize = 50000;
let _analyzeSecurityEvent = SecurityEvent | where TimeGenerated > ago(1d) | take _SampleSize | evaluate narrow() | where isnotempty(tostring(Value)) | summarize AvgLenBytes = round(avg(toreal(strlen(tostring(Value)))), 0), MaxLenBytes = max(strlen(tostring(Value))) by Column | where AvgLenBytes > 0 | extend TableName = "SecurityEvent";
let _analyzeSigninLogs = SigninLogs | where TimeGenerated > ago(1d) | take _SampleSize | evaluate narrow() | where isnotempty(tostring(Value)) | summarize AvgLenBytes = round(avg(toreal(strlen(tostring(Value)))), 0), MaxLenBytes = max(strlen(tostring(Value))) by Column | where AvgLenBytes > 0 | extend TableName = "SigninLogs";
let _analyzeAADNonInteractive = AADNonInteractiveUserSignInLogs | where TimeGenerated > ago(1d) | take _SampleSize | evaluate narrow() | where isnotempty(tostring(Value)) | summarize AvgLenBytes = round(avg(toreal(strlen(tostring(Value)))), 0), MaxLenBytes = max(strlen(tostring(Value))) by Column | where AvgLenBytes > 0 | extend TableName = "AADNonInteractiveUserSignInLogs";
let _analyzeCSL = CommonSecurityLog | where TimeGenerated > ago(1d) | take _SampleSize | evaluate narrow() | where isnotempty(tostring(Value)) | summarize AvgLenBytes = round(avg(toreal(strlen(tostring(Value)))), 0), MaxLenBytes = max(strlen(tostring(Value))) by Column | where AvgLenBytes > 0 | extend TableName = "CommonSecurityLog";
let _analyzeSyslog = Syslog | where TimeGenerated > ago(1d) | take _SampleSize | evaluate narrow() | where isnotempty(tostring(Value)) | summarize AvgLenBytes = round(avg(toreal(strlen(tostring(Value)))), 0), MaxLenBytes = max(strlen(tostring(Value))) by Column | where AvgLenBytes > 0 | extend TableName = "Syslog";
let _analyzeAuditLogs = AuditLogs | where TimeGenerated > ago(1d) | take _SampleSize | evaluate narrow() | where isnotempty(tostring(Value)) | summarize AvgLenBytes = round(avg(toreal(strlen(tostring(Value)))), 0), MaxLenBytes = max(strlen(tostring(Value))) by Column | where AvgLenBytes > 0 | extend TableName = "AuditLogs";
let _analyzeAzureActivity = AzureActivity | where TimeGenerated > ago(1d) | take _SampleSize | evaluate narrow() | where isnotempty(tostring(Value)) | summarize AvgLenBytes = round(avg(toreal(strlen(tostring(Value)))), 0), MaxLenBytes = max(strlen(tostring(Value))) by Column | where AvgLenBytes > 0 | extend TableName = "AzureActivity";
let _analyzeOfficeActivity = OfficeActivity | where TimeGenerated > ago(1d) | take _SampleSize | evaluate narrow() | where isnotempty(tostring(Value)) | summarize AvgLenBytes = round(avg(toreal(strlen(tostring(Value)))), 0), MaxLenBytes = max(strlen(tostring(Value))) by Column | where AvgLenBytes > 0 | extend TableName = "OfficeActivity";
let _analyzeHeartbeat = Heartbeat | where TimeGenerated > ago(1d) | take _SampleSize | evaluate narrow() | where isnotempty(tostring(Value)) | summarize AvgLenBytes = round(avg(toreal(strlen(tostring(Value)))), 0), MaxLenBytes = max(strlen(tostring(Value))) by Column | where AvgLenBytes > 0 | extend TableName = "Heartbeat";
let _analyzeEvent = Event | where TimeGenerated > ago(1d) | take _SampleSize | evaluate narrow() | where isnotempty(tostring(Value)) | summarize AvgLenBytes = round(avg(toreal(strlen(tostring(Value)))), 0), MaxLenBytes = max(strlen(tostring(Value))) by Column | where AvgLenBytes > 0 | extend TableName = "Event";
let _allColumns = union
    _analyzeSecurityEvent, _analyzeSigninLogs, _analyzeAADNonInteractive,
    _analyzeCSL, _analyzeSyslog, _analyzeAuditLogs,
    _analyzeAzureActivity, _analyzeOfficeActivity, _analyzeHeartbeat, _analyzeEvent;
_allColumns
| join kind=inner (
    _allColumns | summarize TotalAvgLen = sum(AvgLenBytes) by TableName
) on TableName
| extend PctOfRowSize = round(AvgLenBytes / toreal(TotalAvgLen) * 100, 1)
| where PctOfRowSize > 30
| extend Recommendation = case(
    PctOfRowSize > 50, "CRITICAL: >50% of row size - Use project-away in DCR transform",
    "Medium: >30% of row size - Study to use project-away in DCR transform"
)
| project TableName, Column, AvgLenBytes, MaxLenBytes, PctOfRowSize, Recommendation
| order by TableName asc, PctOfRowSize desc

Explanation

This query is designed to analyze data tables to identify columns that are consuming a significant portion of the average row size, which could be optimized to improve data storage and processing efficiency. Here's a simple breakdown of what the query does:

  1. Sample Size: It sets a sample size of 50,000 records to analyze from each table.

  2. Table Analysis: It examines several tables (e.g., SecurityEvent, SigninLogs, etc.) for the past day. For each table:

    • It selects a sample of records.
    • It uses the narrow() function to pivot columns, making it easier to analyze each column's data.
    • It calculates the average and maximum length in bytes of the data in each column.
    • It filters out columns with an average length greater than zero and records the table name.
  3. Combine Results: It combines the results from all analyzed tables into a single dataset.

  4. Calculate Percentage: It calculates what percentage of the total average row size each column's average length represents.

  5. Identify Oversized Columns: It identifies columns where this percentage is greater than 30%, marking them as oversized.

  6. Recommendations: It provides recommendations based on the percentage:

    • If a column takes up more than 50% of the row size, it's marked as "CRITICAL" and suggests using "project-away" in Data Collection Rule (DCR) transforms.
    • If a column takes up more than 30% but less than 50%, it's marked as "Medium" and suggests studying the column for potential optimization.
  7. Output: It outputs a list of oversized columns, sorted by table name and percentage of row size, along with the recommendation for each.

This analysis helps in identifying columns that could be optimized or removed to improve data efficiency in storage and processing.

Details

David Alonso profile picture

David Alonso

Released: April 8, 2026

Tables

SecurityEventSigninLogsAADNonInteractiveUserSignInLogsCommonSecurityLogSyslogAuditLogsAzureActivityOfficeActivityHeartbeatEvent

Keywords

SecurityEventSigninLogsAADNonInteractiveUserSignInLogsCommonSecurityLogSyslogAuditLogsAzureActivityOfficeActivityHeartbeatEvent

Operators

letwhereagotakeevaluatenarrowisnotemptytostringsummarizeroundavgtorealstrlenmaxbyextendunionjoinkindoncaseprojectorder byascdesc

Actions