@inproceedings{chae-etal-2024-coffee,
title = "Coffee-Gym: An Environment for Evaluating and Improving Natural Language Feedback on Erroneous Code",
author = "Chae, Hyungjoo and
Kwon, Taeyoon and
Moon, Seungjun and
Song, Yongho and
Kang, Dongjin and
Ong, Kai Tzu-iunn and
Kwak, Beong-woo and
Bae, Seonghyeon and
Hwang, Seung-won and
Yeo, Jinyoung",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1254",
pages = "22503--22524",
abstract = "This paper presents Coffee-Gym, a comprehensive RL environment for training models that provide feedback on code editing. Coffee-Gym includes two major components: (1) Coffee, a dataset containing humans{'} code edit traces for coding questions and human-written feedback for editing erroneous code; (2) CoffeeEval, a reward function that faithfully reflects the helpfulness of feedback by assessing the performance of the revised code in unit tests. With them, Coffee-Gym addresses the unavailability of high-quality datasets for training feedback models with RL, and provides more accurate rewards than the SOTA reward model (i.e., GPT-4). By applying Coffee-Gym, we elicit feedback models that outperform baselines in enhancing open-source code LLMs{'} code editing, making them comparable with closed-source LLMs. We make the dataset and the model checkpoint publicly available in https://huggingface.co/spaces/Coffee-Gym/Project-Coffee-Gym.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chae-etal-2024-coffee">
<titleInfo>
<title>Coffee-Gym: An Environment for Evaluating and Improving Natural Language Feedback on Erroneous Code</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hyungjoo</namePart>
<namePart type="family">Chae</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taeyoon</namePart>
<namePart type="family">Kwon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seungjun</namePart>
<namePart type="family">Moon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yongho</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongjin</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="given">Tzu-iunn</namePart>
<namePart type="family">Ong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Beong-woo</namePart>
<namePart type="family">Kwak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seonghyeon</namePart>
<namePart type="family">Bae</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seung-won</namePart>
<namePart type="family">Hwang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinyoung</namePart>
<namePart type="family">Yeo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents Coffee-Gym, a comprehensive RL environment for training models that provide feedback on code editing. Coffee-Gym includes two major components: (1) Coffee, a dataset containing humans’ code edit traces for coding questions and human-written feedback for editing erroneous code; (2) CoffeeEval, a reward function that faithfully reflects the helpfulness of feedback by assessing the performance of the revised code in unit tests. With them, Coffee-Gym addresses the unavailability of high-quality datasets for training feedback models with RL, and provides more accurate rewards than the SOTA reward model (i.e., GPT-4). By applying Coffee-Gym, we elicit feedback models that outperform baselines in enhancing open-source code LLMs’ code editing, making them comparable with closed-source LLMs. We make the dataset and the model checkpoint publicly available in https://huggingface.co/spaces/Coffee-Gym/Project-Coffee-Gym.</abstract>
<identifier type="citekey">chae-etal-2024-coffee</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1254</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>22503</start>
<end>22524</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Coffee-Gym: An Environment for Evaluating and Improving Natural Language Feedback on Erroneous Code
%A Chae, Hyungjoo
%A Kwon, Taeyoon
%A Moon, Seungjun
%A Song, Yongho
%A Kang, Dongjin
%A Ong, Kai Tzu-iunn
%A Kwak, Beong-woo
%A Bae, Seonghyeon
%A Hwang, Seung-won
%A Yeo, Jinyoung
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F chae-etal-2024-coffee
%X This paper presents Coffee-Gym, a comprehensive RL environment for training models that provide feedback on code editing. Coffee-Gym includes two major components: (1) Coffee, a dataset containing humans’ code edit traces for coding questions and human-written feedback for editing erroneous code; (2) CoffeeEval, a reward function that faithfully reflects the helpfulness of feedback by assessing the performance of the revised code in unit tests. With them, Coffee-Gym addresses the unavailability of high-quality datasets for training feedback models with RL, and provides more accurate rewards than the SOTA reward model (i.e., GPT-4). By applying Coffee-Gym, we elicit feedback models that outperform baselines in enhancing open-source code LLMs’ code editing, making them comparable with closed-source LLMs. We make the dataset and the model checkpoint publicly available in https://huggingface.co/spaces/Coffee-Gym/Project-Coffee-Gym.
%U https://aclanthology.org/2024.emnlp-main.1254
%P 22503-22524
Markdown (Informal)
[Coffee-Gym: An Environment for Evaluating and Improving Natural Language Feedback on Erroneous Code](https://aclanthology.org/2024.emnlp-main.1254) (Chae et al., EMNLP 2024)
ACL
- Hyungjoo Chae, Taeyoon Kwon, Seungjun Moon, Yongho Song, Dongjin Kang, Kai Tzu-iunn Ong, Beong-woo Kwak, Seonghyeon Bae, Seung-won Hwang, and Jinyoung Yeo. 2024. Coffee-Gym: An Environment for Evaluating and Improving Natural Language Feedback on Erroneous Code. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 22503–22524, Miami, Florida, USA. Association for Computational Linguistics.