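Import `CountVectorizer` from `sklearn.feature_extraction.text` and `train_test_split` from `sklearn.model_selection`. Create a series `y` to store the labels, split the data into training and test sets, and build bag-of-words count vectors from the `text` column.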

动态未结 0 493

小小草 LV1 2023年9月23日 23:40 发表

# Build bag-of-words count features from the 'text' column of `df`.
#
# NOTE: `df` is not defined in this snippet; it must already exist and expose
# 'text' and 'label' columns (presumably a pandas DataFrame -- confirm against
# the code that loads it).

# Import the necessary modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Print the head of df
print(df.head())

# Create a series to store the labels: y
# (bracket access is used for consistency with df["text"] below)
y = df["label"]

# Create training and test sets: 33% of the rows are held out for testing,
# and random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], y, test_size=0.33, random_state=53
)

# Initialize a CountVectorizer object that drops English stop words
count_vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary from the training text only and transform it
count_train = count_vectorizer.fit_transform(X_train)

# Transform the test text with the vocabulary learned above. Use transform(),
# not fit_transform(), so the test columns line up with the training columns.
count_test = count_vectorizer.transform(X_test)

# Print the first 10 features of the count_vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed output a plain Python list as before.
print(count_vectorizer.get_feature_names_out()[:10].tolist())