@EdWorkingPaper{ai25-1335,
  title = "ChatGPT vs. Machine Learning: Assessing the Efficacy and Accuracy of Large Language Models for Automated Essay Scoring",
  author = "Youngwon Kim and Reagan Mozer and Shireen Al-Adeimi and Luke Miratrix",
  institution = "Annenberg Institute at Brown University",
  number = "1335",
  year = "2025",
  month = "November",
  URL = "http://www.edworkingpapers.com/ai25-1335",
  abstract = {Automated Essay Scoring (AES) is a critical tool in education that aims to enhance the efficiency and objectivity of educational assessments. Recent advancements in Large Language Models (LLMs), such as ChatGPT, have sparked interest in their potential for AES. However, comprehensive comparisons of LLM-based methods with traditional machine learning (ML) methods across different assessment contexts remain limited. This study compares the efficacy of LLMs with supervised ML algorithms in assessing both categorical essay opinions and continuous writing quality scores. Using two distinct datasets (argumentative essays from 4th-7th graders about iPad usage in schools, and persuasive essays from 10th graders on censorship in libraries), we systematically assess the performance of ChatGPT compared to four tree-based ML algorithms trained on extensive statistical text features. Our findings show that while LLMs perform well in essay classification tasks, ML methods consistently outperform LLMs in predicting writing quality. We highlight the importance of prompting and fine-tuning techniques in LLM-based scoring, along with the strengths and limitations of both approaches. We also discuss the potential of LLMs to enhance AES in educational settings while underscoring the continued importance of human oversight in evaluating complex writing skills. Overall, this study demonstrates the complementary strengths of different approaches to AES, providing guidance for researchers and educators interested in leveraging LLMs in educational assessment.}
}