Skip to content
This repository was archived by the owner on Jul 31, 2023. It is now read-only.

Commit 7f370d7

Browse files
committed
Added end to end sample for dataframe.
Change-Id: Icd916f4260d122e5415f7201e742b9c3e959bc4c
1 parent b8d5112 commit 7f370d7

File tree

11 files changed

+375
-15
lines changed

11 files changed

+375
-15
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
.idea/
2+
.ipynb_checkpoints/
23
.vscode/
34
tfrutil.workspace.code-workspace
45
__pycache__

README.md

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,38 @@
1-
# TFRecord Conversion Utilities
1+
# TFRecord Conversion Utilities
2+
3+
## Installing
4+
5+
1. Clone this repo.
6+
7+
2. Run the command `python3 setup.py install` from the repository root.
8+
9+
## What is TFRUtil
10+
TFRUtil makes it easy to create TFRecords from images and labels using Pandas DataFrames or CSVs.
11+
Today, TFRUtil supports data stored in 'image csv format' similar to GCP AutoML Vision. In the
12+
future TFRUtil will support converting any dataframe or CSV file into TFRecords.
13+
14+
## Using TFRUtil to create TFRecords
15+
16+
### Image CSV Format
17+
TFRUtil currently expects data to be in the same format as [AutoML Vision](https://cloud.google.com/vision/automl/docs/prepare). This format is a pandas DataFrame or CSV file laid out as:
18+
19+
| split | image_uri | label |
20+
|-------|-------------------------|-------|
21+
| TRAIN | gs://foo/bar/image1.jpg | cat |
22+
23+
Where:
24+
* split can take on the values TRAIN, VALIDATION, and TEST
25+
* image_uri specifies a local path or Google Cloud Storage location for the image file.
26+
* label can be either a text-based label, which will be integerized, or an integer.
27+
28+
### Pandas API
29+
TODO
30+
31+
### Python API
32+
TODO
33+
34+
### CSV File
35+
TODO
36+
37+
## Using TFRUtil to inspect TFRecords
38+
TODO

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,5 @@ ipython >= 7.15.0
77
nose >= 1.3.7
88
pylint >= 2.5.3
99
fire >= 0.3.1
10+
jupyter >= 1.0.0
11+
tensorflow >= 2.2.0
Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Basic TFRUtil Usage\n",
8+
"\n",
9+
"This notebook demonstrates the basic usage of TFRUtil. It is meant to be run from the `<repo>/samples/` directory and uses the test images included with TFRUtil in `<repo>/tfrutil/test_data`.\n",
10+
" \n",
11+
"Before running this notebook, please install TFRUtil by running `python3 setup.py install` from the repository root."
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": 1,
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
"import pandas as pd\n",
21+
"import tfrutil"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": 2,
27+
"metadata": {},
28+
"outputs": [],
29+
"source": [
30+
"df = pd.read_csv(\"data.csv\")"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 3,
36+
"metadata": {},
37+
"outputs": [
38+
{
39+
"data": {
40+
"text/html": [
41+
"<div>\n",
42+
"<style scoped>\n",
43+
" .dataframe tbody tr th:only-of-type {\n",
44+
" vertical-align: middle;\n",
45+
" }\n",
46+
"\n",
47+
" .dataframe tbody tr th {\n",
48+
" vertical-align: top;\n",
49+
" }\n",
50+
"\n",
51+
" .dataframe thead th {\n",
52+
" text-align: right;\n",
53+
" }\n",
54+
"</style>\n",
55+
"<table border=\"1\" class=\"dataframe\">\n",
56+
" <thead>\n",
57+
" <tr style=\"text-align: right;\">\n",
58+
" <th></th>\n",
59+
" <th>split</th>\n",
60+
" <th>image_uri</th>\n",
61+
" <th>label</th>\n",
62+
" </tr>\n",
63+
" </thead>\n",
64+
" <tbody>\n",
65+
" <tr>\n",
66+
" <th>0</th>\n",
67+
" <td>TRAIN</td>\n",
68+
" <td>../tfrutil/test_data/images/cat/cat-640x853-1.jpg</td>\n",
69+
" <td>cat</td>\n",
70+
" </tr>\n",
71+
" <tr>\n",
72+
" <th>1</th>\n",
73+
" <td>VALIDATION</td>\n",
74+
" <td>../tfrutil/test_data/images/cat/cat-800x600-2.jpg</td>\n",
75+
" <td>cat</td>\n",
76+
" </tr>\n",
77+
" <tr>\n",
78+
" <th>2</th>\n",
79+
" <td>TEST</td>\n",
80+
" <td>../tfrutil/test_data/images/cat/cat-800x600-3.jpg</td>\n",
81+
" <td>cat</td>\n",
82+
" </tr>\n",
83+
" <tr>\n",
84+
" <th>3</th>\n",
85+
" <td>TRAIN</td>\n",
86+
" <td>../tfrutil/test_data/images/goat/goat-640x640-...</td>\n",
87+
" <td>goat</td>\n",
88+
" </tr>\n",
89+
" <tr>\n",
90+
" <th>4</th>\n",
91+
" <td>VALIDATION</td>\n",
92+
" <td>../tfrutil/test_data/images/goat/goat-320x320-...</td>\n",
93+
" <td>goat</td>\n",
94+
" </tr>\n",
95+
" <tr>\n",
96+
" <th>5</th>\n",
97+
" <td>TEST</td>\n",
98+
" <td>../tfrutil/test_data/images/goat/goat-640x427-...</td>\n",
99+
" <td>goat</td>\n",
100+
" </tr>\n",
101+
" </tbody>\n",
102+
"</table>\n",
103+
"</div>"
104+
],
105+
"text/plain": [
106+
" split image_uri label\n",
107+
"0 TRAIN ../tfrutil/test_data/images/cat/cat-640x853-1.jpg cat\n",
108+
"1 VALIDATION ../tfrutil/test_data/images/cat/cat-800x600-2.jpg cat\n",
109+
"2 TEST ../tfrutil/test_data/images/cat/cat-800x600-3.jpg cat\n",
110+
"3 TRAIN ../tfrutil/test_data/images/goat/goat-640x640-... goat\n",
111+
"4 VALIDATION ../tfrutil/test_data/images/goat/goat-320x320-... goat\n",
112+
"5 TEST ../tfrutil/test_data/images/goat/goat-640x427-... goat"
113+
]
114+
},
115+
"execution_count": 3,
116+
"metadata": {},
117+
"output_type": "execute_result"
118+
}
119+
],
120+
"source": [
121+
"df"
122+
]
123+
},
124+
{
125+
"cell_type": "code",
126+
"execution_count": 4,
127+
"metadata": {
128+
"scrolled": true
129+
},
130+
"outputs": [
131+
{
132+
"name": "stdout",
133+
"output_type": "stream",
134+
"text": [
135+
"Starting DataFlow Transform. This may take a while. Please wait.\n"
136+
]
137+
},
138+
{
139+
"data": {
140+
"application/javascript": [
141+
"\n",
142+
" if (typeof window.interactive_beam_jquery == 'undefined') {\n",
143+
" var jqueryScript = document.createElement('script');\n",
144+
" jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n",
145+
" jqueryScript.type = 'text/javascript';\n",
146+
" jqueryScript.onload = function() {\n",
147+
" var datatableScript = document.createElement('script');\n",
148+
" datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n",
149+
" datatableScript.type = 'text/javascript';\n",
150+
" datatableScript.onload = function() {\n",
151+
" window.interactive_beam_jquery = jQuery.noConflict(true);\n",
152+
" window.interactive_beam_jquery(document).ready(function($){\n",
153+
" \n",
154+
" });\n",
155+
" }\n",
156+
" document.head.appendChild(datatableScript);\n",
157+
" };\n",
158+
" document.head.appendChild(jqueryScript);\n",
159+
" } else {\n",
160+
" window.interactive_beam_jquery(document).ready(function($){\n",
161+
" \n",
162+
" });\n",
163+
" }"
164+
]
165+
},
166+
"metadata": {},
167+
"output_type": "display_data"
168+
},
169+
{
170+
"data": {
171+
"application/javascript": [
172+
"\n",
173+
" var import_html = () => {\n",
174+
" ['https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html'].forEach(href => {\n",
175+
" var link = document.createElement('link');\n",
176+
" link.rel = 'import'\n",
177+
" link.href = href;\n",
178+
" document.head.appendChild(link);\n",
179+
" });\n",
180+
" }\n",
181+
" if ('import' in document.createElement('link')) {\n",
182+
" import_html();\n",
183+
" } else {\n",
184+
" var webcomponentScript = document.createElement('script');\n",
185+
" webcomponentScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js';\n",
186+
" webcomponentScript.type = 'text/javascript';\n",
187+
" webcomponentScript.onload = function(){\n",
188+
" import_html();\n",
189+
" };\n",
190+
" document.head.appendChild(webcomponentScript);\n",
191+
" }"
192+
]
193+
},
194+
"metadata": {},
195+
"output_type": "display_data"
196+
},
197+
{
198+
"name": "stdout",
199+
"output_type": "stream",
200+
"text": [
201+
"TFRecords created. Output stored in ./out\n"
202+
]
203+
}
204+
],
205+
"source": [
206+
"df.tensorflow.to_tfr(output_path=\"./out\")"
207+
]
208+
},
209+
{
210+
"cell_type": "code",
211+
"execution_count": 7,
212+
"metadata": {},
213+
"outputs": [
214+
{
215+
"name": "stdout",
216+
"output_type": "stream",
217+
"text": [
218+
"./out:\r\n",
219+
"tfrutil-20200629-192138-to-tfr\ttfrutil-beam.log\r\n",
220+
"\r\n",
221+
"./out/tfrutil-20200629-192138-to-tfr:\r\n",
222+
"discarded-data-00000-of-00001\t train-00000-of-00001.tfrecord.gz\r\n",
223+
"schema.pbtxt\t\t\t transformed_metadata\r\n",
224+
"test-00000-of-00001.tfrecord.gz transform_fn\r\n",
225+
"tft_tmp\t\t\t\t val-00000-of-00001.tfrecord.gz\r\n",
226+
"\r\n",
227+
"./out/tfrutil-20200629-192138-to-tfr/tft_tmp:\r\n",
228+
"tftransform_tmp\r\n",
229+
"\r\n",
230+
"./out/tfrutil-20200629-192138-to-tfr/tft_tmp/tftransform_tmp:\r\n",
231+
"5584526e1f6f4d7d9f5d25ad896a1ecf 940f1e45d0fa4c8f93e58ac8fb7cdc5c\r\n",
232+
"8441d43a7b774700bdda1a61797ab274 vocab_compute_and_apply_vocabulary_vocabulary\r\n",
233+
"\r\n",
234+
"./out/tfrutil-20200629-192138-to-tfr/tft_tmp/tftransform_tmp/5584526e1f6f4d7d9f5d25ad896a1ecf:\r\n",
235+
"saved_model.pb\tvariables\r\n",
236+
"\r\n",
237+
"./out/tfrutil-20200629-192138-to-tfr/tft_tmp/tftransform_tmp/5584526e1f6f4d7d9f5d25ad896a1ecf/variables:\r\n",
238+
"\r\n",
239+
"./out/tfrutil-20200629-192138-to-tfr/tft_tmp/tftransform_tmp/8441d43a7b774700bdda1a61797ab274:\r\n",
240+
"assets\tsaved_model.pb\tvariables\r\n",
241+
"\r\n",
242+
"./out/tfrutil-20200629-192138-to-tfr/tft_tmp/tftransform_tmp/8441d43a7b774700bdda1a61797ab274/assets:\r\n",
243+
"vocab_compute_and_apply_vocabulary_vocabulary\r\n",
244+
"\r\n",
245+
"./out/tfrutil-20200629-192138-to-tfr/tft_tmp/tftransform_tmp/8441d43a7b774700bdda1a61797ab274/variables:\r\n",
246+
"\r\n",
247+
"./out/tfrutil-20200629-192138-to-tfr/tft_tmp/tftransform_tmp/940f1e45d0fa4c8f93e58ac8fb7cdc5c:\r\n",
248+
"saved_model.pb\tvariables\r\n",
249+
"\r\n",
250+
"./out/tfrutil-20200629-192138-to-tfr/tft_tmp/tftransform_tmp/940f1e45d0fa4c8f93e58ac8fb7cdc5c/variables:\r\n",
251+
"\r\n",
252+
"./out/tfrutil-20200629-192138-to-tfr/transformed_metadata:\r\n",
253+
"schema.pbtxt\r\n",
254+
"\r\n",
255+
"./out/tfrutil-20200629-192138-to-tfr/transform_fn:\r\n",
256+
"assets\tsaved_model.pb\tvariables\r\n",
257+
"\r\n",
258+
"./out/tfrutil-20200629-192138-to-tfr/transform_fn/assets:\r\n",
259+
"vocab_compute_and_apply_vocabulary_vocabulary\r\n",
260+
"\r\n",
261+
"./out/tfrutil-20200629-192138-to-tfr/transform_fn/variables:\r\n"
262+
]
263+
}
264+
],
265+
"source": [
266+
"!ls -R ./out"
267+
]
268+
},
269+
{
270+
"cell_type": "markdown",
271+
"metadata": {},
272+
"source": [
273+
"# That's it!\n",
274+
"\n",
275+
"As you can see, TFRUtil has taken the supplied CSV and transformed it into TFRecords that are ready for consumption, along with the saved transform function (the `transform_fn` directory)."
276+
]
277+
}
278+
],
279+
"metadata": {
280+
"kernelspec": {
281+
"display_name": "Python 3",
282+
"language": "python",
283+
"name": "python3"
284+
},
285+
"language_info": {
286+
"codemirror_mode": {
287+
"name": "ipython",
288+
"version": 3
289+
},
290+
"file_extension": ".py",
291+
"mimetype": "text/x-python",
292+
"name": "python",
293+
"nbconvert_exporter": "python",
294+
"pygments_lexer": "ipython3",
295+
"version": "3.7.7"
296+
}
297+
},
298+
"nbformat": 4,
299+
"nbformat_minor": 4
300+
}

samples/data.csv

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
split,image_uri,label
2+
TRAIN,../tfrutil/test_data/images/cat/cat-640x853-1.jpg,cat
3+
VALIDATION,../tfrutil/test_data/images/cat/cat-800x600-2.jpg,cat
4+
TEST,../tfrutil/test_data/images/cat/cat-800x600-3.jpg,cat
5+
TRAIN,../tfrutil/test_data/images/goat/goat-640x640-1.jpg,goat
6+
VALIDATION,../tfrutil/test_data/images/goat/goat-320x320-2.jpg,goat
7+
TEST,../tfrutil/test_data/images/goat/goat-640x427-3.jpg,goat

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"nose >= 1.3.7",
2929
"pylint >= 2.5.3",
3030
"fire >= 0.3.1",
31+
"tensorflow >= 2.2.0"
3132
]
3233

3334

tfrutil/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,5 @@
1515
# limitations under the License.
1616

1717
"""Imports."""
18-
19-
from tfrutil.client import create_tfrecords
18+
from tfrutil import accessor
19+
from tfrutil import client

0 commit comments

Comments
 (0)