"Is This Real?" – A Data Plausibility Check on the Chocolate Sales Dataset¶

The Trigger: A comment on Kaggle questioned whether this popular chocolate sales dataset represents real-world business data. That got me curious — so I decided to stress-test the numbers using the same kind of analytical checks you'd run when validating sales figures in a financial context: pricing consistency, concentration risk, structural patterns, and data completeness.

The Verdict: The data is almost certainly synthetically generated. Here's how I got there — step by step.

Dataset: Chocolate Sales on Kaggle | 3,282 transactions | 25 salespersons | 6 countries | 22 products | Jan 2022 – Aug 2024


1. Setup & First Look¶

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')

df = pd.read_csv(r"C:\Users\bvenn\OneDrive\Desktop\Python Projekte\Showcases\Tableau_projekte\Chocolate_sales\dataset\Schoki_sales.csv")
df['date'] = pd.to_datetime(df['date'])
df['revenue_per_box'] = (df['sales_amount'] / df['boxes_shipped']).round(2)

print(f'Shape: {df.shape}')
print(f'Date Range: {df["date"].min().date()} to {df["date"].max().date()}')
print(f'Salespersons: {df["sales_person"].nunique()} | Countries: {df["country"].nunique()} | Products: {df["product"].nunique()}')
print(f'\nMissing Values: {df.isnull().sum().sum()} | Duplicates: {df.duplicated().sum()} | Negative Values: {(df["sales_amount"] <= 0).sum()}')
df.head()
Shape: (3282, 7)
Date Range: 2022-01-03 to 2024-08-31
Salespersons: 25 | Countries: 6 | Products: 22

Missing Values: 0 | Duplicates: 0 | Negative Values: 0
sales_person country product date sales_amount boxes_shipped revenue_per_box
0 Jehu Rudeforth UK Mint Chip Choco 2022-01-04 5320 180 29.56
1 Van Tuxwell India 85% Dark Bars 2022-08-01 7896 94 84.00
2 Gigi Bohling India Peanut Butter Cubes 2022-07-07 4501 91 49.46
3 Jan Morforth Australia Peanut Butter Cubes 2022-04-27 12726 342 37.21
4 Jehu Rudeforth UK Peanut Butter Cubes 2022-02-24 13685 184 74.38

At first glance, a clean dataset: no missing values, no duplicates, no negative amounts. Exactly the kind of thing that looks fine until you start digging into the numbers.

2. Red Flag #1 – Pricing All Over the Place¶

Revenue per Box (RPB), the average revenue per unit shipped, is our key metric here. For any FMCG business, you'd expect relatively stable pricing per product, with moderate variation from volume discounts or seasonal promotions.

# ===============================
# Descriptive Statistics
# ===============================
print('=== Revenue per Box – Overall Distribution ===')
print(f'Minimum:  ${df["revenue_per_box"].min():,.2f}')
print(f'Median:   ${df["revenue_per_box"].median():,.2f}')
print(f'Mean:     ${df["revenue_per_box"].mean():,.2f}')
print(f'Maximum:  ${df["revenue_per_box"].max():,.2f}')
print(f'\nRPB > $500:  {(df["revenue_per_box"] > 500).sum()} transactions '
      f'({(df["revenue_per_box"] > 500).mean()*100:.1f}%)')
print(f'RPB < $1:    {(df["revenue_per_box"] < 1).sum()} transactions '
      f'({(df["revenue_per_box"] < 1).mean()*100:.1f}%)')

# ===============================
# Prepare extreme values
# ===============================
top5 = df.nlargest(5, 'revenue_per_box')[[
    'product', 'country', 'boxes_shipped', 'sales_amount', 'revenue_per_box'
]]

bottom5 = df.nsmallest(5, 'revenue_per_box')[[
    'product', 'country', 'boxes_shipped', 'sales_amount', 'revenue_per_box'
]]

# ===============================
# Create custom grid layout
# ===============================
fig = plt.figure(figsize=(18, 8))
gs = fig.add_gridspec(2, 2, width_ratios=[2, 1])

# ---- (1) Boxplot spans full left column ----
ax_box = fig.add_subplot(gs[:, 0])
sns.boxplot(data=df, x='product', y='revenue_per_box', ax=ax_box)
ax_box.tick_params(axis='x', rotation=90)  # avoids the set_ticklabels FixedLocator warning
ax_box.set_title('RPB Distribution by Product')
ax_box.set_ylabel('Revenue per Box ($)')
ax_box.set_xlabel('Product')

# ---- (2) Top 5 Highest (top-right) ----
ax_top = fig.add_subplot(gs[0, 1])
ax_top.barh(range(5), top5['revenue_per_box'].values)
ax_top.set_yticks(range(5))
ax_top.set_yticklabels(
    [f"{row['product']} ({row['country']})" for _, row in top5.iterrows()]
)
ax_top.set_title('Top 5 Highest RPB')
ax_top.set_xlabel('Revenue per Box ($)')
ax_top.invert_yaxis()

# ---- (3) Top 5 Lowest (bottom-right) ----
ax_bottom = fig.add_subplot(gs[1, 1])
ax_bottom.barh(range(5), bottom5['revenue_per_box'].values)
ax_bottom.set_yticks(range(5))
ax_bottom.set_yticklabels(
    [f"{row['product']} ({row['country']})" for _, row in bottom5.iterrows()]
)
ax_bottom.set_title('Top 5 Lowest RPB')
ax_bottom.set_xlabel('Revenue per Box ($)')
ax_bottom.invert_yaxis()

plt.tight_layout()
plt.show()
=== Revenue per Box – Overall Distribution ===
Minimum:  $0.01
Median:   $38.19
Mean:     $111.33
Maximum:  $4,692.00

RPB > $500:  138 transactions (4.2%)
RPB < $1:    56 transactions (1.7%)

An RPB spread of $0.01 to $4,692 — for chocolate. The median sits at $38, but the mean is $111, indicating massive right-skew.
138 transactions exceed $500 per box, and 56 fall below $1. No realistic pricing model produces this kind of spread.

Coefficient of Variation – Pricing Consistency Check¶

The CoV (standard deviation / mean) measures how consistent a product's pricing is. For a real FMCG company, you'd expect CoV between 0.1 and 0.5 for most products.

rpb_by_product = df.groupby('product')['revenue_per_box'].agg(['mean', 'median', 'std', 'count'])
rpb_by_product['cov'] = (rpb_by_product['std'] / rpb_by_product['mean']).round(2)
rpb_by_product = rpb_by_product.sort_values('cov', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
colors = ['#e74c3c' if x > 2 else '#f39c12' if x > 1.5 else '#f1c40f' for x in rpb_by_product['cov']]
bars = ax.barh(rpb_by_product.index, rpb_by_product['cov'], color=colors)
ax.axvline(x=0.5, color='green', linestyle='--', linewidth=2, label='Expected FMCG range (0.1–0.5)')
ax.axvline(x=1.0, color='orange', linestyle='--', linewidth=2, label='CoV = 1 (std > mean)')
ax.set_xlabel('Coefficient of Variation')
ax.set_title('Pricing Consistency by Product (CoV)')
ax.legend()
ax.invert_yaxis()
plt.tight_layout()
plt.show()

print(f'Lowest CoV:  {rpb_by_product["cov"].min()} ({rpb_by_product["cov"].idxmin()})')
print(f'Highest CoV: {rpb_by_product["cov"].max()} ({rpb_by_product["cov"].idxmax()})')
print(f'\nProducts with CoV > 1: {(rpb_by_product["cov"] > 1).sum()} out of {len(rpb_by_product)}')
Lowest CoV:  1.43 (Almond Choco)
Highest CoV: 3.31 (Mint Chip Choco)

Products with CoV > 1: 22 out of 22

All 22 products have a CoV above 1.4. That means for every single product, the standard deviation exceeds the mean. Not a single product shows a stable pricing structure. In a real company, this would be an immediate red flag in any data review.

3. Red Flag #2 – Suspiciously Uniform Distribution¶

Real businesses have cash cows and underperformers. The Pareto principle (80/20 rule) tells us that roughly 20% of products typically generate 80% of revenue. Let's check.

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Product Pareto
prod_rev = df.groupby('product')['sales_amount'].sum().sort_values(ascending=False)
prod_rev_pct = (prod_rev / prod_rev.sum() * 100)
prod_rev_cum = prod_rev_pct.cumsum()

axes[0].bar(range(len(prod_rev)), prod_rev_pct.values, color='#2196F3', alpha=0.7)
ax2 = axes[0].twinx()
ax2.plot(range(len(prod_rev)), prod_rev_cum.values, 'r-o', markersize=4)
ax2.axhline(y=80, color='red', linestyle='--', alpha=0.5)
axes[0].set_xticks(range(len(prod_rev)))
axes[0].set_xticklabels(prod_rev.index, rotation=90)
axes[0].set_ylabel('Revenue Share (%)')
ax2.set_ylabel('Cumulative %')
axes[0].set_title('Product Revenue Concentration')

n_80_prod = (prod_rev_cum <= 80).sum() + 1

# Salesperson Pareto
sp_rev = df.groupby('sales_person')['sales_amount'].sum().sort_values(ascending=False)
sp_rev_pct = (sp_rev / sp_rev.sum() * 100)
sp_rev_cum = sp_rev_pct.cumsum()

axes[1].bar(range(len(sp_rev)), sp_rev_pct.values, color='#4CAF50', alpha=0.7)
ax3 = axes[1].twinx()
ax3.plot(range(len(sp_rev)), sp_rev_cum.values, 'r-o', markersize=4)
ax3.axhline(y=80, color='red', linestyle='--', alpha=0.5)
axes[1].set_xticks(range(len(sp_rev)))
axes[1].set_xticklabels(sp_rev.index, rotation=90)
axes[1].set_ylabel('Revenue Share (%)')
ax3.set_ylabel('Cumulative %')
axes[1].set_title('Salesperson Revenue Concentration')

n_80_sp = (sp_rev_cum <= 80).sum() + 1

plt.tight_layout()
plt.show()

print(f'Products needed for 80% revenue: {n_80_prod} of {len(prod_rev)} ({n_80_prod/len(prod_rev)*100:.0f}%) – expected: ~20%')
print(f'Salespersons needed for 80% revenue: {n_80_sp} of {len(sp_rev)} ({n_80_sp/len(sp_rev)*100:.0f}%) – expected: ~20%')
Products needed for 80% revenue: 17 of 22 (77%) – expected: ~20%
Salespersons needed for 80% revenue: 19 of 25 (76%) – expected: ~20%
sp_country = df.groupby('sales_person')['country'].nunique()

print(f'Number of markets: {df["country"].nunique()}')
print(f'Countries per salesperson: min={sp_country.min()}, max={sp_country.max()}')
print(f'\n-> Every single one of the 25 salespersons sells in all 6 countries.')
print('   In a real global company, you\'d expect regional coverage — not everyone selling everywhere.')
Number of markets: 6
Countries per salesperson: min=6, max=6

-> Every single one of the 25 salespersons sells in all 6 countries.
   In a real global company, you'd expect regional coverage — not everyone selling everywhere.

77% of products are needed to reach 80% of revenue; in a typical business, you'd expect closer to 20% (the Pareto principle), with clear cash cows and underperformers. The same pattern holds for salespersons: 76% of them generate 80% of revenue. On top of that, every single salesperson operates across all 6 countries with no regional focus. Any one of these findings alone could be explained away. But the fact that products, salespersons, and geography all show the same suspiciously flat distribution at the same time is what makes this a red flag — real businesses almost always have some form of concentration somewhere.
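One way to put a number on this flatness is the Gini coefficient (0 = perfectly uniform shares, values near 1 = heavy concentration). Below is a minimal sketch with made-up revenue shares; `gini` is a helper defined here for illustration, not part of the analysis above. Applied to `prod_rev` or `sp_rev`, a near-zero value would corroborate the missing Pareto effect.

```python
import numpy as np

def gini(values):
    """Gini coefficient of a 1-D array of non-negative values.
    0 = perfectly even shares; values near 1 = extreme concentration."""
    v = np.sort(np.asarray(values, dtype=float))
    n = len(v)
    # Rank-weighted cumulative-sum formulation of the Gini coefficient
    return 2 * np.sum(np.arange(1, n + 1) * v) / (n * v.sum()) - (n + 1) / n

# A Pareto-like portfolio (a few cash cows) vs. a flat synthetic one
pareto_like = np.array([50, 20, 10, 8, 5, 3, 2, 1, 0.5, 0.5])
flat = np.full(10, 10.0)

print(f'Gini (Pareto-like): {gini(pareto_like):.2f}')  # → 0.65, heavy concentration
print(f'Gini (flat):        {gini(flat):.2f}')         # → 0.00, suspiciously uniform
```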

4. Red Flag #3 – Where's Christmas?¶

Chocolate is a seasonal business. Halloween, Christmas, and Valentine's Day typically account for a significant share of annual revenue in confectionery. Let's look at the calendar.

monthly = df.groupby([df['date'].dt.year.rename('year'), df['date'].dt.month.rename('month')]).agg(
    transactions=('sales_amount', 'count'),
    revenue=('sales_amount', 'sum')
)

tx_pivot = monthly['transactions'].unstack(level=0)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

tx_pivot.plot(kind='bar', ax=axes[0], colormap='Set2')
axes[0].set_title('Transactions per Month')
axes[0].set_xlabel('Month')
axes[0].set_ylabel('Number of Transactions')
axes[0].legend(title='Year')

months_present = df.groupby([df['date'].dt.year.rename('year'), df['date'].dt.month.rename('month')]).size().reset_index(name='count')
heatmap_data = months_present.pivot(index='year', columns='month', values='count').reindex(columns=range(1,13))
sns.heatmap(heatmap_data, annot=True, fmt='.0f', cmap='YlOrRd', ax=axes[1], 
            xticklabels=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'],
            linewidths=0.5, cbar_kws={'label': 'Transactions'})
axes[1].set_title('Transaction Heatmap by Month & Year')
axes[1].set_ylabel('Year')

plt.tight_layout()
plt.show()

print('=== Transactions per Month (Year-over-Year) ===')
print(tx_pivot.to_string())
print(f'\n-> September through December: completely empty. No Q4 business.')
print(f'-> Transaction counts per month are IDENTICAL across all 3 years.')
=== Transactions per Month (Year-over-Year) ===
year   2022  2023  2024
month                  
1       154   154   154
2       110   110   110
3       131   131   131
4       118   118   118
5       135   135   135
6       163   163   163
7       149   149   149
8       134   134   134

-> September through December: completely empty. No Q4 business.
-> Transaction counts per month are IDENTICAL across all 3 years.

Two major issues here:

  1. Q4 is completely missing: September through December, the peak revenue season for any confectionery business (Halloween, Christmas), simply doesn't exist in the data
  2. Transaction counts per month are identical across all three years: January always has exactly 154 transactions, February always 110, March always 131, and so on. In reality, these numbers would naturally fluctuate.
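To gauge just how improbable identical counts are, assume (purely for illustration — the original analysis makes no distributional claim) that each month's transaction count is an independent Poisson draw with a mean equal to the observed count. The chance of a count repeating exactly in two further years is then:

```python
from math import exp, lgamma, log

def poisson_pmf(k, lam):
    # P(X = k) for a Poisson(lam) variable, computed in log space for stability
    return exp(k * log(lam) - lam - lgamma(k + 1))

# January's count, identical in 2022, 2023, and 2024
jan = 154
# Even at the distribution's mode, a single exact hit is unlikely;
# two further years both landing on exactly 154 compounds that:
p_match_once = poisson_pmf(jan, jan)
p_three_identical = p_match_once ** 2

# And all eight observed months repeating across all three years:
counts = [154, 110, 131, 118, 135, 163, 149, 134]
p_all = 1.0
for c in counts:
    p_all *= poisson_pmf(c, c) ** 2

print(f'P(one year hits exactly {jan} again):      {p_match_once:.4f}')
print(f'P(three identical Januaries):             {p_three_identical:.2e}')
print(f'P(all 8 months identical, all 3 years):   {p_all:.2e}')
```

Under this assumption the all-months probability is astronomically small, which is what "statistically impossible with real data" means in practice.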

5. The Smoking Gun – Deterministic Patterns¶

This is where it gets forensic. We compare individual product-country combinations across all three years and look for patterns that would be impossible in real-world data.

samples = [
    ('White Choc', 'Canada'),
    ('Eclairs', 'Australia'),
    ('Drinking Coco', 'USA'),
    ('Mint Chip Choco', 'New Zealand'),
]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for idx, (product, country) in enumerate(samples):
    subset = df[(df['product'] == product) & (df['country'] == country)].sort_values('date')
    subset['month_day'] = subset['date'].dt.strftime('%m-%d')
    
    pivot_boxes = subset.pivot_table(index='month_day', columns=subset['date'].dt.year, values='boxes_shipped', aggfunc='first')
    
    ax = axes[idx]
    pivot_boxes.plot(kind='bar', ax=ax, colormap='Set2', width=0.8)
    ax.set_title(f'{product} | {country}', fontsize=12, fontweight='bold')
    ax.set_xlabel('Date (MM-DD)')
    ax.set_ylabel('Boxes Shipped')
    ax.legend(title='Year')
    ax.tick_params(axis='x', rotation=45)
    
plt.suptitle('Boxes Shipped: Same dates, same patterns — three years running', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
# Systematic analysis: How many exact matches exist across all product-country combos?
all_results = []

for product in df['product'].unique():
    for country in df['country'].unique():
        subset = df[(df['product'] == product) & (df['country'] == country)].sort_values('date')
        if len(subset) == 0:
            continue
        
        subset['month_day'] = subset['date'].dt.strftime('%m-%d')
        pivot = subset.pivot_table(index='month_day', columns=subset['date'].dt.year, values='boxes_shipped', aggfunc='first')
        
        years = sorted(pivot.columns)
        if len(years) < 3:
            continue
            
        complete = pivot.dropna()
        if len(complete) == 0:
            continue
            
        exact_3y = (complete[years[0]] == complete[years[1]]) & (complete[years[1]] == complete[years[2]])
        
        pivot_rpb = subset.pivot_table(index='month_day', columns=subset['date'].dt.year, values='revenue_per_box', aggfunc='first')
        complete_rpb = pivot_rpb.dropna()
        if len(complete_rpb) > 0:
            monoton_rising = ((complete_rpb[years[2]] > complete_rpb[years[1]]) & (complete_rpb[years[1]] > complete_rpb[years[0]])).mean()
        else:
            monoton_rising = np.nan
        
        all_results.append({
            'product': product,
            'country': country,
            'n_dates': len(complete),
            'exact_matches_3y': exact_3y.sum(),
            'exact_match_pct': (exact_3y.mean() * 100).round(1),
            'rpb_monoton_rising_pct': (monoton_rising * 100).round(1) if not np.isnan(monoton_rising) else np.nan
        })

results_df = pd.DataFrame(all_results)

print('=== Deterministic Pattern Analysis ===')
print(f'Product-country combinations analyzed: {len(results_df)}')
print(f'Combinations with exact box matches (all 3 years): {(results_df["exact_matches_3y"] > 0).sum()} of {len(results_df)}')
print(f'Average exact match rate: {results_df["exact_match_pct"].mean():.1f}%')
print(f'\nRPB monotonically rising (2022 < 2023 < 2024):')
print(f'Average: {results_df["rpb_monoton_rising_pct"].mean():.1f}% of data points per combination')

print(f'\n=== Top 10 Combinations by Exact Match Rate ===')
print(results_df.nlargest(10, 'exact_match_pct')[['product', 'country', 'n_dates', 'exact_matches_3y', 'exact_match_pct']].to_string(index=False))
=== Deterministic Pattern Analysis ===
Product-country combinations analyzed: 132
Combinations with exact box matches (all 3 years): 37 of 132
Average exact match rate: 3.9%

RPB monotonically rising (2022 < 2023 < 2024):
Average: 48.0% of data points per combination

=== Top 10 Combinations by Exact Match Rate ===
             product     country  n_dates  exact_matches_3y  exact_match_pct
          White Choc      Canada        9                 4            44.40
Choco Coated Almonds         USA        4                 1            25.00
             Eclairs   Australia       10                 2            20.00
Choco Coated Almonds   Australia        5                 1            20.00
 Baker's Choco Chips      Canada        5                 1            20.00
     Mint Chip Choco          UK        6                 1            16.70
     99% Dark & Pure      Canada        6                 1            16.70
        Orange Choco          UK        6                 1            16.70
      70% Dark Bites New Zealand        6                 1            16.70
           Milk Bars      Canada        7                 1            14.30
# Deep dive: White Choc / Canada — the most striking example
wc_ca = df[(df['product'] == 'White Choc') & (df['country'] == 'Canada')].sort_values('date')
wc_ca['month_day'] = wc_ca['date'].dt.strftime('%m-%d')

pivot_detail = wc_ca.pivot_table(
    index='month_day', 
    columns=wc_ca['date'].dt.year, 
    values=['boxes_shipped', 'revenue_per_box'], 
    aggfunc='first'
)

print('=== White Choc | Canada — Year-over-Year Comparison ===')
print(pivot_detail.to_string())
print('\n-> Boxes shipped on Mar 29: exactly 1, 1, 1 — three years in a row')
print('-> Boxes shipped on Mar 22: exactly 3, 3, 3 — three years in a row')
print('-> Boxes shipped on Jul 11: exactly 4, 4, 4 — three years in a row')
print('-> RPB increases slightly each year: A built-in inflation factor (~3-5%)')
=== White Choc | Canada — Year-over-Year Comparison ===
          boxes_shipped           revenue_per_box                  
date               2022 2023 2024            2022     2023     2024
month_day                                                          
01-25               136  132  142           34.02    35.75    36.83
02-14                29   31   29          140.24   143.06   150.62
03-03                72   74   71           46.96    45.92    53.51
03-22                 3    3    3          140.00   160.33   156.00
03-29                 1    1    1        4,291.00 4,692.00 4,590.00
04-05               268  270  292           21.97    24.42    23.79
07-11                 4    4    4        1,646.75 1,746.25 1,746.75
07-15               173  185  179           53.61    53.82    59.07
08-11                15   15   15          504.00   512.33   571.87

-> Boxes shipped on Mar 29: exactly 1, 1, 1 — three years in a row
-> Boxes shipped on Mar 22: exactly 3, 3, 3 — three years in a row
-> Boxes shipped on Jul 11: exactly 4, 4, 4 — three years in a row
-> RPB increases slightly each year: A built-in inflation factor (~3-5%)

Reverse-Engineering the Generator¶

The patterns are clear now. The dataset appears to have been built with the following logic:

  1. Fixed date templates per product-country-salesperson combination: the same calendar days recur every year
  2. Boxes shipped = base value ± noise (~5%): for small base values (1-6 boxes), the noise isn't enough to survive rounding, resulting in identical values across years
  3. Revenue = base value × (1 + inflation)^year ± noise: a built-in YoY uplift of ~3-5%
  4. No interaction between dimensions: salespersons have no regional territories, products have no differentiated market positions
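The logic above can be sketched as a toy generator. The template dates, base values, and noise levels below are illustrative assumptions loosely modeled on the White Choc / Canada table, not parameters recovered from the data:

```python
import numpy as np

rng = np.random.default_rng(42)

# Hypothetical template for one product-country combination:
# fixed calendar days, each with a base box count and a base RPB
template = {'03-22': 3, '03-29': 1, '07-11': 4, '01-25': 136}
base_rpb = {'03-22': 140.0, '03-29': 4300.0, '07-11': 1650.0, '01-25': 34.0}
inflation = 0.04  # assumed ~4% built-in YoY uplift

rows = []
for year_idx, year in enumerate([2022, 2023, 2024]):
    for day, base_boxes in template.items():
        # ±5% multiplicative noise; rounding erases it for tiny base values
        boxes = round(base_boxes * rng.uniform(0.95, 1.05))
        rpb = base_rpb[day] * (1 + inflation) ** year_idx * rng.uniform(0.98, 1.02)
        rows.append((year, day, boxes, round(rpb, 2)))

for r in rows:
    print(r)
# For base values of 1-4 boxes, ±5% noise never survives rounding,
# yielding the identical counts seen three years running; the large
# base (136) varies slightly, just like 136 / 132 / 142 in the data.
```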

6. The Volume-Price Distortion¶

One last piece of the puzzle: I noticed that low-volume shipments (<20 boxes) produce extreme RPB values. Is this a real pricing phenomenon or a generator artifact?

# Structural break: At what volume does RPB stabilize?
cutoffs = range(5, 105, 5)
results = []
for cutoff in cutoffs:
    subset = df[df['boxes_shipped'] >= cutoff]
    cov = subset['revenue_per_box'].std() / subset['revenue_per_box'].mean()
    results.append({'min_boxes': cutoff, 'n': len(subset), 'rpb_mean': subset['revenue_per_box'].mean().round(2), 'rpb_cov': cov.round(4)})

stability = pd.DataFrame(results)

fig, ax1 = plt.subplots(figsize=(10, 5))
ax2 = ax1.twinx()

ax1.plot(stability['min_boxes'], stability['rpb_cov'], 'o-', color='#e74c3c', linewidth=2, label='CoV (pricing dispersion)')
ax2.plot(stability['min_boxes'], stability['rpb_mean'], 's-', color='#2196F3', linewidth=2, label='Mean RPB ($)')

ax1.axvspan(5, 30, alpha=0.1, color='red', label='Unstable zone')
ax1.axvline(x=30, color='gray', linestyle='--', alpha=0.5)

ax1.set_xlabel('Minimum Boxes Shipped (Cutoff)', fontsize=12)
ax1.set_ylabel('Coefficient of Variation', color='#e74c3c', fontsize=12)
ax2.set_ylabel('Mean RPB ($)', color='#2196F3', fontsize=12)
ax1.set_title('RPB Stability by Volume Cutoff — Where Does Pricing Stabilize?', fontsize=13)

lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right')
ax1.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Volume tier comparison
df['volume_tier'] = pd.cut(
    df['boxes_shipped'],
    bins=[0, 20, 50, 100, 200, 400, float('inf')],
    labels=['Micro (<20)', 'Small (20-50)', 'Medium (50-100)', 'Large (100-200)', 'Bulk (200-400)', 'Mega (400+)']
)

volume_stats = df.groupby('volume_tier', observed=True)['revenue_per_box'].agg(['mean', 'std', 'count'])
volume_stats['cov'] = (volume_stats['std'] / volume_stats['mean']).round(2)
print('=== RPB by Volume Tier ===')
print(volume_stats)
print(f'\n-> Micro tier: Mean RPB ${volume_stats.loc["Micro (<20)", "mean"]:.0f} vs. Mega tier: ${volume_stats.loc["Mega (400+)", "mean"]:.0f} — a {volume_stats.loc["Micro (<20)", "mean"]/volume_stats.loc["Mega (400+)", "mean"]:.0f}x spread')
=== RPB by Volume Tier ===
                  mean    std  count  cov
volume_tier                              
Micro (<20)     876.18 825.36    204 0.94
Small (20-50)   190.44 146.95    360 0.77
Medium (50-100)  83.23  68.93    638 0.83
Large (100-200)  41.76  31.94   1061 0.76
Bulk (200-400)   22.08  17.83    821 0.81
Mega (400+)      12.98   9.53    198 0.73

-> Micro tier: Mean RPB $876 vs. Mega tier: $13 — a 68x spread

The structural break confirms it: below ~30 boxes, pricing behaves fundamentally differently. This isn't a volume discount effect.
An RPB spread of $876 (Micro) vs. $13 (Mega) can't be explained by any realistic business model.
There are no metadata fields (order type, sales channel, packaging tier) that could justify these outliers.
In a real dataset, you'd expect that context to exist: here, it doesn't, because the generator simply didn't model it.
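A quick simulation shows why a generator that draws `sales_amount` and `boxes_shipped` independently (my hypothesis about how this data was made, not a documented fact) reproduces exactly this tier pattern: the ratio blows up whenever the denominator is small, no pricing model required.

```python
import numpy as np

rng = np.random.default_rng(0)

# Independent draws, roughly matching the dataset's scales
sales = rng.uniform(1, 15_000, size=10_000)
boxes = rng.integers(1, 700, size=10_000)
rpb = sales / boxes

micro = rpb[boxes < 20]
mega = rpb[boxes >= 400]

print(f'Mean RPB, micro tier (<20 boxes): ${micro.mean():,.0f}')
print(f'Mean RPB, mega tier (400+ boxes): ${mega.mean():,.0f}')
# A huge micro-vs-mega spread emerges from independence alone --
# the same artifact we see in the real dataset's volume tiers.
```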

7. Verdict¶

findings = pd.DataFrame({
    'Finding': [
        'RPB spread $0.01 – $4,692 for chocolate',
        'CoV > 1.4 across all 22 products',
        'No Pareto effect (77% of products needed for 80% revenue)',
        'All 25 salespersons active in all 6 countries',
        'Q4 completely missing (Sep–Dec)',
        'Identical transaction counts per month across 3 years',
        'Deterministic date templates with noise overlay',
        'Built-in YoY inflation factor (~3-5%)'
    ],
    'Severity': ['Critical', 'Critical', 'High', 'High', 'High', 'Critical', 'Critical', 'Medium'],
    'Category': [
        'Pricing', 'Pricing', 'Concentration', 'Sales Structure',
        'Completeness', 'Structure', 'Structure', 'Structure'
    ]
})

print('=' * 80)
print('  VERDICT: Synthetically Generated Dataset')
print('=' * 80)
print()
print(findings.to_string(index=False))
print()
print('-' * 80)
print('The data shows none of the characteristics of a real business.')
print('Instead, it exhibits all hallmarks of a rule-based generator:')
print('fixed templates, deterministic noise, missing dimensional interactions,')
print('and a suspiciously uniform distribution across all segments.')
print('-' * 80)
================================================================================
  VERDICT: Synthetically Generated Dataset
================================================================================

                                                  Finding Severity        Category
                  RPB spread $0.01 – $4,692 for chocolate Critical         Pricing
                         CoV > 1.4 across all 22 products Critical         Pricing
No Pareto effect (77% of products needed for 80% revenue)     High   Concentration
            All 25 salespersons active in all 6 countries     High Sales Structure
                          Q4 completely missing (Sep–Dec)     High    Completeness
    Identical transaction counts per month across 3 years Critical       Structure
          Deterministic date templates with noise overlay Critical       Structure
                    Built-in YoY inflation factor (~3-5%)   Medium       Structure

--------------------------------------------------------------------------------
The data shows none of the characteristics of a real business.
Instead, it exhibits all hallmarks of a rule-based generator:
fixed templates, deterministic noise, missing dimensional interactions,
and a suspiciously uniform distribution across all segments.
--------------------------------------------------------------------------------

Conclusion¶

What started as a simple question — "Is this real data?" — turned into a full plausibility check. Using pricing analysis, concentration checks, structural break detection, and pattern recognition, we were able to systematically demonstrate that this dataset is synthetically generated.

The key evidence:

  • Not a single product has a realistic pricing structure (all CoV > 1.4)
  • Every salesperson sells in every country — no sales territories whatsoever
  • Identical transaction counts per month across three years: statistically implausible with real data
  • Deterministic patterns (same calendar days, same base values, slight noise): the fingerprint of a generator

This doesn't mean the dataset is useless. It's perfectly fine for practicing visualizations and analysis, and I had loads of fun determining the data's nature.


Analysis performed with Python (pandas, seaborn, matplotlib). Inspired by a comment on Kaggle.