Posts

Showing posts from September, 2025

10

import numpy as np import pandas as pd # Define the regression model coefficients intercept = 2.5 beta_age = -0.03 beta_condition = 0.5 # Create a DataFrame for two patients: one with and one without a chronic condition data = pd.DataFrame({ 'Age': [60, 60], 'Condition': [1, 0]   # 1 = has chronic condition, 0 = does not }) # Calculate log(λ) using the model data['log_lambda'] = intercept + beta_age * data['Age'] + beta_condition * data['Condition'] # Exponentiate to get λ (expected number of visits) data['lambda'] = np.exp(data['log_lambda']) # Calculate the percentage increase due to chronic condition increase_pct = ((data.loc[0, 'lambda'] - data.loc[1, 'lambda']) / data.loc[1, 'lambda']) * 100 # Display results print(data[['Age', 'Condition', 'lambda']]) print(f"\nIncrease in expected visits due to chronic condition: {increase_pct:.2f}%...

6

# Two-proportion z-test: do two groups have different conversion rates?
import statsmodels.api as sm

# Significance level for the reject / fail-to-reject decision below.
ALPHA = 0.05

# Observed conversions (x) and sample sizes (n) for each group.
x1 = 120
n1 = 1000
x2 = 150
n2 = 1200

count = [x1, x2]
nobs = [n1, n2]

# Called with its default arguments; per the statsmodels documentation this is
# a two-sided test of the null hypothesis that both proportions are equal.
z_stat, p_value = sm.stats.proportions_ztest(count, nobs)

print(f"Z-statistic: {z_stat:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < ALPHA:
    print(
        "Result: Reject the null hypothesis. "
        "There is a statistically significant difference in conversion rates."
    )
else:
    print(
        "Result: Fail to reject the null hypothesis. "
        "No significant difference in conversion rates."
    )

9

# Linear-spline (piecewise-linear) regression of house price on square
# footage, with a single knot where the slope is allowed to change.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Knot location: the slope of price vs. sqft may differ above this size.
KNOT_SQFT = 2000

# Sample data: Housing Prices and Square Footage
data = {
    'sqft': [1500, 1700, 1900, 2000, 2100, 2300, 2500],
    'price': [200000, 220000, 240000, 260000, 290000, 320000, 350000],
}
df = pd.DataFrame(data)

# Add the spline (hinge) feature: (sqft - KNOT_SQFT)+, i.e. the amount by
# which sqft exceeds the knot, and 0 at or below it.
df['spline'] = np.where(df['sqft'] > KNOT_SQFT, df['sqft'] - KNOT_SQFT, 0)

# Design matrix: intercept + sqft + spline term. The spline coefficient is the
# change in slope above the knot relative to the slope below it.
X = sm.add_constant(df[['sqft', 'spline']])
y = df['price']

# Fit the model
model = sm.OLS(y, X).fit()

# Output the results
print(model.summary())

8

# OLS regression of salary on education level (categorical) and years of
# experience, using the statsmodels formula interface.
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Education levels in the order they should be encoded; the first entry is
# the baseline that the other levels are compared against.
EDUCATION_LEVELS = ["High School", "Bachelor's", "Master's"]

# Sample data
data = pd.DataFrame({
    'Salary': [40000, 50000, 60000, 70000, 80000, 55000, 65000, 75000, 85000],
    'Education': [
        "High School", "Bachelor's", "Master's",
        "High School", "Bachelor's", "Master's",
        "High School", "Bachelor's", "Master's",
    ],
    'Experience': [2, 3, 4, 5, 6, 7, 3, 5, 8],
})

# Convert Education to categorical with High School as base: the formula's
# default treatment coding uses the first category as the reference level.
data['Education'] = pd.Categorical(data['Education'], categories=EDUCATION_LEVELS)

# Fit regression model; each C(Education) coefficient is the salary difference
# from the High School baseline at a fixed level of Experience.
model = smf.ols('Salary ~ C(Education) + Experience', data=data).fit()
print(model.summary())

7

# Given values mean_a = 1000 std_a = 100 n_a = 30 mean_b = 950 std_b = 120 n_b = 30 # Step 1: Compute the difference in means mean_diff = mean_a - mean_b # Step 2: Compute the standard error (Welch's formula for unequal variances) se = ((std_a ** 2) / n_a + (std_b ** 2) / n_b) ** 0.5 # Step 3: Compute the t-statistic t_stat = mean_diff / se # Step 4: Compute degrees of freedom using Welch–Satterthwaite approximation df_numerator = ((std_a ** 2) / n_a + (std_b ** 2) / n_b) ** 2 df_denominator = (( (std_a ** 2) / n_a ) ** 2) / (n_a - 1) + (( (std_b ** 2) / n_b ) ** 2) / (n_b - 1) df = df_numerator / df_denominator # Print results print(f"T-statistic: {t_stat:.4f}") print(f"Approximate Degrees of Freedom: {df:.2f}") # Interpretation guide (manual comparison needed) # For example: Critical t-value at df ≈ 55, α=0.05 (two-tailed) ≈ ±2.004 # Decision if abs(t_stat) > 2.004:      print("Result: Reject H₀ → Signi...