Random Dataset Generation and Linear Regression Training with Vaex

  • Share this:

Code introduction


This code defines two functions. The first generates a random dataset as a Vaex DataFrame. The second splits that dataset into training and testing sets, standardizes the features with scikit-learn's StandardScaler, and trains a linear regression model on the training portion.


Technology Stack: Vaex, NumPy, pandas, scikit-learn

Code Type : Custom function

Code Difficulty : Intermediate


                
                    
import vaex as vx
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def generate_random_dataset(size=1000, *, seed=None):
    """Build a Vaex DataFrame filled with random sample data.

    Columns:
        A: floats drawn from the standard normal distribution.
        B: integers drawn uniformly from 0 to 9 (upper bound 10 is exclusive).
        C: integers drawn uniformly from 0 to 99 (upper bound 100 is exclusive).

    Args:
        size: Number of rows to generate.
        seed: Optional seed for a private random generator, making the output
            reproducible. When ``None`` (the default) NumPy's global random
            state is used.

    Returns:
        A Vaex DataFrame with ``size`` rows and the columns ``A``, ``B``, ``C``.
    """
    # RandomState exposes the same randn/randint methods as the np.random
    # module, so either source can be used interchangeably below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # The DataFrame class itself is not built from a dict of columns;
    # from_dict is the factory that wraps in-memory arrays as columns.
    return vx.from_dict({
        'A': rng.randn(size),
        'B': rng.randint(0, 10, size),
        'C': rng.randint(0, 100, size),
    })

def train_model(df):
    """Fit a linear regression predicting ``C`` from standardized ``A`` and ``B``.

    The data is split 80/20 with a fixed ``random_state`` of 42; only the
    training portion is used to fit the scaler and the model. The held-out
    20% is not used here.

    Args:
        df: A Vaex DataFrame (any object exposing ``to_pandas_df``) or a
            pandas DataFrame containing the columns ``A``, ``B`` and ``C``.

    Returns:
        The fitted ``sklearn.linear_model.LinearRegression``. The scaler is
        not returned, so inputs passed to ``predict`` must be standardized
        by the caller in the same way.
    """
    from sklearn.linear_model import LinearRegression

    feature_cols = ['A', 'B']
    target_col = 'C'

    # The scikit-learn splitter and scaler work on pandas/NumPy-style data,
    # so a Vaex DataFrame is materialized to pandas first (only the columns
    # that are actually needed). A pandas DataFrame is used as-is.
    if hasattr(df, 'to_pandas_df'):
        frame = df.to_pandas_df(column_names=feature_cols + [target_col])
    else:
        frame = df

    train_df, _ = train_test_split(frame, test_size=0.2, random_state=42)

    # Build a new frame for the scaled features instead of assigning back
    # into a slice of the split; keeping column names lets the model record
    # which features it was fitted on.
    scaler = StandardScaler()
    scaled_features = pd.DataFrame(
        scaler.fit_transform(train_df[feature_cols]),
        columns=feature_cols,
        index=train_df.index,
    )

    model = LinearRegression()
    model.fit(scaled_features, train_df[target_col])

    return model