diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000000..528f30c71c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000000..048c3d2f04 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + remote = storage + autostage = true +['remote "storage"'] + url = s3://dvc-public/remote/dvc-org/blogs-media diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000000..5197305523 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.husky/post-checkout b/.husky/post-checkout new file mode 100755 index 0000000000..5b3922fde0 --- /dev/null +++ b/.husky/post-checkout @@ -0,0 +1,2 @@ +#!/bin/sh +exec dvc checkout diff --git a/.husky/pre-push b/.husky/pre-push new file mode 100755 index 0000000000..7fb87365a7 --- /dev/null +++ b/.husky/pre-push @@ -0,0 +1,4 @@ +#!/bin/sh +if [[ `git --no-pager diff --name-only main '*.dvc'` ]]; then + exec dvc push +fi diff --git a/app.json b/app.json index 3fbb194c7b..4f95b5573f 100644 --- a/app.json +++ b/app.json @@ -1,6 +1,9 @@ { "addons": [], "buildpacks": [ + { + "url": "heroku/python" + }, { "url": "heroku/nodejs" } diff --git a/content/.gitignore b/content/.gitignore new file mode 100644 index 0000000000..30c72a71ba --- /dev/null +++ b/content/.gitignore @@ -0,0 +1 @@ +/uploads diff --git a/content/authors/0x2b3bfa0.md b/content/authors/0x2b3bfa0.md new file mode 100644 index 0000000000..2c86905437 --- /dev/null +++ b/content/authors/0x2b3bfa0.md @@ -0,0 +1,9 @@ +--- +name: Helio Machado +avatar: 0x2b3bfa0.jpg +links: + - https://github.com/0x2b3bfa0 +--- + +Evergreen polymath with a taste for security, open technologies and expressive +code. 
diff --git a/content/authors/aguschin.md b/content/authors/aguschin.md new file mode 100644 index 0000000000..a66dddc6f2 --- /dev/null +++ b/content/authors/aguschin.md @@ -0,0 +1,8 @@ +--- +name: Alexander Guschin +avatar: aguschin.jpeg +links: + - https://www.linkedin.com/in/1aguschin/ +--- + +Technical Product Engineer at [Iterative](https://iterative.ai/) diff --git a/content/authors/alex_kim.md b/content/authors/alex_kim.md new file mode 100644 index 0000000000..92b6103cb3 --- /dev/null +++ b/content/authors/alex_kim.md @@ -0,0 +1,12 @@ +--- +name: Alex Kim +avatar: alex_kim.png +links: + - https://www.linkedin.com/in/alex000kim/ + - https://github.com/alex000kim/ +--- + +Independent Consultant, MLOps Engineer, Open-Source Contributor, and Technical +Instructor. + +[alex000kim.com](https://alex000kim.com/) diff --git a/content/authors/batuhan_taskaya.md b/content/authors/batuhan_taskaya.md new file mode 100644 index 0000000000..4b4aed54fa --- /dev/null +++ b/content/authors/batuhan_taskaya.md @@ -0,0 +1,8 @@ +--- +name: Batuhan Taskaya +avatar: batuhan_taskaya.png +links: + - https://twitter.com/isidentical +--- + +Software Engineer at [DVC](https://dvc.org) diff --git a/content/authors/casper_dcl.md b/content/authors/casper_dcl.md new file mode 100644 index 0000000000..eb030e1a6a --- /dev/null +++ b/content/authors/casper_dcl.md @@ -0,0 +1,11 @@ +--- +name: Casper da Costa-Luis +avatar: casper_dcl.jpg +links: + - https://github.com/casperdcl +--- + +Computational Physicist; Python Software Foundation (PSF) voting member; GitHub +OS maintainers member. [tqdm](https://github.com/tqdm/tqdm) primary maintainer. + +Expertise in C++/CUDA/Python/Git/Docker. 
diff --git a/content/authors/daniel_kharitonov.md b/content/authors/daniel_kharitonov.md new file mode 100644 index 0000000000..2e864db956 --- /dev/null +++ b/content/authors/daniel_kharitonov.md @@ -0,0 +1,8 @@ +--- +name: Daniel Kharitonov +avatar: daniel_kharitonov.png +links: + - https://www.linkedin.com/in/danielkharitonov/ +--- + +Technical Product Manager at [DVC.ai](https://dvc.ai) diff --git a/content/authors/dave_berenbaum.md b/content/authors/dave_berenbaum.md new file mode 100644 index 0000000000..ba45d21db5 --- /dev/null +++ b/content/authors/dave_berenbaum.md @@ -0,0 +1,8 @@ +--- +name: Dave Berenbaum +avatar: dave_berenbaum.png +links: + - https://www.linkedin.com/in/david-berenbaum-20b6b424/ +--- + +Technical Product Manager at [Iterative](https://iterative.ai/) diff --git a/content/authors/david_g_ortega.md b/content/authors/david_g_ortega.md new file mode 100644 index 0000000000..12eaa2cab8 --- /dev/null +++ b/content/authors/david_g_ortega.md @@ -0,0 +1,8 @@ +--- +name: David G Ortega +avatar: david_g_ortega.png +links: + - https://github.com/DavidGOrtega +--- + +Founder; Techstars Alumni; Software Engineer; Machine Learning Engineer; diff --git a/content/authors/diglesia.md b/content/authors/diglesia.md new file mode 100644 index 0000000000..d4c4e059b1 --- /dev/null +++ b/content/authors/diglesia.md @@ -0,0 +1,8 @@ +--- +name: David de la Iglesia +avatar: diglesia.jpg +links: + - https://github.com/daavoo +--- + +Software Engineer at [Iterative](https://iterative.ai/) diff --git a/content/authors/dmitry_petrov.md b/content/authors/dmitry_petrov.md new file mode 100644 index 0000000000..ce7c219a33 --- /dev/null +++ b/content/authors/dmitry_petrov.md @@ -0,0 +1,11 @@ +--- +name: Dmitry Petrov +avatar: dmitry_petrov.png +links: + - https://twitter.com/fullstackml + - https://www.linkedin.com/in/dmitryleopetrov +--- + +Creator of [http://dvc.org](http://dvc.org) — Git for ML. Ex-Data Scientist +[@Microsoft](http://twitter.com/Microsoft). PhD in CS. 
Making jokes with a +serious face. diff --git a/content/authors/dom_miketa.md b/content/authors/dom_miketa.md new file mode 100644 index 0000000000..26d233449a --- /dev/null +++ b/content/authors/dom_miketa.md @@ -0,0 +1,9 @@ +--- +name: Dom Miketa +avatar: dom_miketa.jpeg +links: + - https://www.exscientia.ai + - https://www.linkedin.com/in/dom-miketa-1815b7198 +--- + +Senior AI research assistant at [Exscientia](https://www.exscientia.ai/). diff --git a/content/authors/elle_obrien.md b/content/authors/elle_obrien.md new file mode 100644 index 0000000000..c7be16a4e0 --- /dev/null +++ b/content/authors/elle_obrien.md @@ -0,0 +1,8 @@ +--- +name: Elle O'Brien +avatar: elle_obrien.jpg +links: + - https://twitter.com/DrElleOBrien +--- + +Data scientist at [http://dvc.org](http://dvc.org) diff --git a/content/authors/gema_parreno.md b/content/authors/gema_parreno.md new file mode 100644 index 0000000000..098ffee466 --- /dev/null +++ b/content/authors/gema_parreno.md @@ -0,0 +1,8 @@ +--- +name: Gema Parreno +avatar: gema_parreno.jpeg +links: + - https://github.com/SoyGema +--- + +Developer Advocate at [Iterative](https://iterative.ai/) diff --git a/content/authors/george_vyshnya.md b/content/authors/george_vyshnya.md new file mode 100644 index 0000000000..b1c92bd406 --- /dev/null +++ b/content/authors/george_vyshnya.md @@ -0,0 +1,10 @@ +--- +name: George Vyshnya +avatar: george_vyshnya.jpeg +links: + - https://www.linkedin.com/in/gvyshnya +--- + +Seasoned Data Scientist / Software Developer with blended experience in software +development, IT, DevOps, PM and C-level roles. 
CTO at +[http://sbc-group.pl](http://sbc-group.pl) diff --git a/content/authors/guro_bokum.md b/content/authors/guro_bokum.md new file mode 100644 index 0000000000..b641a9f3f1 --- /dev/null +++ b/content/authors/guro_bokum.md @@ -0,0 +1,8 @@ +--- +name: Guro Bokum +avatar: guro_bokum.jpg +links: + - https://www.linkedin.com/in/gurobokum/ +--- + +Senior Software Engineer at [Iterative](https://iterative.ai/) diff --git a/content/authors/jeny_defigueiredo.md b/content/authors/jeny_defigueiredo.md new file mode 100644 index 0000000000..dcf00954ec --- /dev/null +++ b/content/authors/jeny_defigueiredo.md @@ -0,0 +1,8 @@ +--- +name: Jeny De Figueiredo +avatar: jeny_defigueiredo.png +links: + - https://twitter.com/jendefig +--- + +Community Manager at [DVC](https://dvc.org) diff --git a/content/authors/jorge_orpinel.md b/content/authors/jorge_orpinel.md new file mode 100644 index 0000000000..95c8754b96 --- /dev/null +++ b/content/authors/jorge_orpinel.md @@ -0,0 +1,8 @@ +--- +name: Jorge Orpinel Pérez +avatar: jorge.jpg +links: + - https://www.linkedin.com/in/jorgeorpinel +--- + +Technical writer and developer at [dvc.org](http://dvc.org/) diff --git a/content/authors/luis_yanes.md b/content/authors/luis_yanes.md new file mode 100644 index 0000000000..8727a47457 --- /dev/null +++ b/content/authors/luis_yanes.md @@ -0,0 +1,9 @@ +--- +name: Luis Yanes +avatar: luis_yanes.png +links: + - https://www.exscientia.ai + - https://www.linkedin.com/in/ljyanesm +--- + +Senior software engineer at [Exscientia](https://www.exscientia.ai/). 
diff --git a/content/authors/marcel_rd.md b/content/authors/marcel_rd.md new file mode 100644 index 0000000000..66ddada930 --- /dev/null +++ b/content/authors/marcel_rd.md @@ -0,0 +1,16 @@ +--- +name: Marcel Ribeiro-Dantas +avatar: marcel.jpg +links: + - https://twitter.com/mribeirodantas +--- + +Early Stage Researcher at [Institut Curie](https://institut-curie.org/) with +over 10 years of experience in the field of biomedical engineering and health +informatics. Areas of interest include Causal Inference, Artificial +Intelligence, and Data Science. Degrees in Computer and Automation Engineering +(Eng), Big Data (Grad degree), and Bioinformatics (MSc). Currently enrolled in a +Ph.D. at EDITE (Sorbonne Université). + +Twitter: [@mribeirodantas](https://twitter.com/mribeirodantas) Website: +[mribeirodantas.me](http://mribeirodantas.me) diff --git a/content/authors/maria_khalusova.md b/content/authors/maria_khalusova.md new file mode 100644 index 0000000000..910356a225 --- /dev/null +++ b/content/authors/maria_khalusova.md @@ -0,0 +1,8 @@ +--- +name: Maria Khalusova +avatar: maria_khalusova.jpg +links: + - https://twitter.com/mariaKhalusova +--- + +Senior Developer Advocate at [Iterative](https://iterative.ai) diff --git a/content/authors/marija_ilic.md b/content/authors/marija_ilic.md new file mode 100644 index 0000000000..fe23e8ea70 --- /dev/null +++ b/content/authors/marija_ilic.md @@ -0,0 +1,8 @@ +--- +name: Marija Ilić +avatar: marija_ilic.png +links: + - https://www.linkedin.com/in/marija-ili%C4%87-65b8a53 +--- + +Data scientist at Njuškalo, Croatia. 
diff --git a/content/authors/maxim_shmakov.md b/content/authors/maxim_shmakov.md new file mode 100644 index 0000000000..48021520ff --- /dev/null +++ b/content/authors/maxim_shmakov.md @@ -0,0 +1,8 @@ +--- +name: Maxim Shmakov +avatar: maxim-shmakov.jpg +links: + - https://github.com/mvshmakov +--- + +Front End Developer at [Iterative](https://iterative.ai) diff --git a/content/authors/mert_bozkir.md b/content/authors/mert_bozkir.md new file mode 100644 index 0000000000..30da6b4cac --- /dev/null +++ b/content/authors/mert_bozkir.md @@ -0,0 +1,8 @@ +--- +name: Mert Bozkir +avatar: mert_bozkir.jpg +links: + - https://www.linkedin.com/in/mertbozkir/ +--- + +Community Coordinator at [Iterative](https://iterative.ai/) diff --git a/content/authors/mike0sv.md b/content/authors/mike0sv.md new file mode 100644 index 0000000000..d7ea92986b --- /dev/null +++ b/content/authors/mike0sv.md @@ -0,0 +1,8 @@ +--- +name: Mike Sveshnikov +avatar: mike0sv.jpeg +links: + - https://www.linkedin.com/in/mike0sv/ +--- + +MLEM Project Team Lead at [Iterative](https://iterative.ai/) diff --git a/content/authors/mikhail_rozhkov.md b/content/authors/mikhail_rozhkov.md new file mode 100644 index 0000000000..83ea329970 --- /dev/null +++ b/content/authors/mikhail_rozhkov.md @@ -0,0 +1,8 @@ +--- +name: Mikhail Rozhkov +avatar: mikhail_rozhkov.jpeg +links: + - https://www.linkedin.com/in/mnrozhkov/ +--- + +Solutions Engineer at [Iterative](https://iterative.ai/) diff --git a/content/authors/milecia_mcgregor.md b/content/authors/milecia_mcgregor.md new file mode 100644 index 0000000000..15b9762faf --- /dev/null +++ b/content/authors/milecia_mcgregor.md @@ -0,0 +1,8 @@ +--- +name: Milecia McGregor +avatar: milecia_mcgregor.jpg +links: + - https://twitter.com/flippedcoding +--- + +Developer Advocate at [DVC](https://dvc.org) diff --git a/content/authors/peter_rowlands.md b/content/authors/peter_rowlands.md new file mode 100644 index 0000000000..7e71f62fd9 --- /dev/null +++ 
b/content/authors/peter_rowlands.md @@ -0,0 +1,9 @@ +--- +name: Peter Rowlands +avatar: peter_rowlands.jpg +links: + - https://github.com/pmrowla + - https://www.linkedin.com/in/pmrowla +--- + +Software engineer at [dvc.org](https://dvc.org/) diff --git a/content/authors/peter_zikan.md b/content/authors/peter_zikan.md new file mode 100644 index 0000000000..5b4519bd24 --- /dev/null +++ b/content/authors/peter_zikan.md @@ -0,0 +1,8 @@ +--- +name: Petr Zikán +avatar: peter-zikan.jpeg +links: + - https://www.linkedin.com/in/petr-zik%C3%A1n-88054815a/ +--- + +CTO at [PlasmaSolve](https://plasmasolve.com) diff --git a/content/authors/rob_dewit.md b/content/authors/rob_dewit.md new file mode 100644 index 0000000000..ac386c7a6b --- /dev/null +++ b/content/authors/rob_dewit.md @@ -0,0 +1,8 @@ +--- +name: Rob de Wit +avatar: rob_dewit.jpg +links: + - https://www.linkedin.com/in/rcdewit/ +--- + +Developer Advocate at [Iterative](https://iterative.ai) diff --git a/content/authors/ryan.md b/content/authors/ryan.md new file mode 100644 index 0000000000..69532151c1 --- /dev/null +++ b/content/authors/ryan.md @@ -0,0 +1,8 @@ +--- +name: Ryan Turner +avatar: ryan.jpg +links: + - https://twitter.com/otterkoala +--- + +ML Solutions Engineer at [DVC](https://dvc.org) diff --git a/content/authors/svetlana_grinchenko.md b/content/authors/svetlana_grinchenko.md new file mode 100644 index 0000000000..e831a6f66a --- /dev/null +++ b/content/authors/svetlana_grinchenko.md @@ -0,0 +1,8 @@ +--- +name: Svetlana Grinchenko +avatar: svetlana_grinchenko.jpeg +links: + - https://twitter.com/a142hr +--- + +Head of developer relations at [http://dvc.org](http://dvc.org) diff --git a/content/authors/tapa_dipti_sitaula.md b/content/authors/tapa_dipti_sitaula.md new file mode 100644 index 0000000000..a991447775 --- /dev/null +++ b/content/authors/tapa_dipti_sitaula.md @@ -0,0 +1,8 @@ +--- +name: Tapa Dipti Sitaula +avatar: tapa_dipti_sitaula.png +links: + - 
https://www.linkedin.com/in/tapa-dipti-sitaula/ +--- + +Sr Product Engineer at [Iterative](https://iterative.ai/) diff --git a/content/authors/tibor_mach.md b/content/authors/tibor_mach.md new file mode 100644 index 0000000000..addf68ec4c --- /dev/null +++ b/content/authors/tibor_mach.md @@ -0,0 +1,9 @@ +--- +name: Tibor Mach +avatar: tibor_mach.jpeg +links: + - https://github.com/tibor-mach + - https://www.linkedin.com/in/tibor-mach/ +--- + +ML/MLOps engineer, ex data science architect at Atos, PhD in maths, drummer diff --git a/content/blogs/2017-05-15-how-data-scientists-can-improve-their-productivity.md b/content/blogs/2017-05-15-how-data-scientists-can-improve-their-productivity.md new file mode 100644 index 0000000000..720bec196d --- /dev/null +++ b/content/blogs/2017-05-15-how-data-scientists-can-improve-their-productivity.md @@ -0,0 +1,169 @@ +--- +title: How Data Scientists Can Improve Their Productivity +date: 2017-05-15 +description: > + Data science and machine learning are iterative processes. It is never + possible to successfully complete a data science project in a single pass. +descriptionLong: > + The iteration time is a critical parameter in data science process. The + quicker you iterate, the more you can check ideas and build a better model. + The productivity of data scientists can be improved by speeding up iteration + processes and the DVC tool takes care of this. +picture: 2017-05-15/post-image.jpg +author: dmitry_petrov +commentsUrl: https://discuss.dvc.org/t/how-a-data-scientist-can-improve-their-productivity/301 +tags: + - Productivity + - Python + - Tutorial +--- + +Data science and machine learning are iterative processes. It is never possible +to successfully complete a data science project in a single pass. A data +scientist constantly tries new ideas and changes steps of her pipeline: + +1. extract new features and accidentally find noise in the data; + +2. clean up the noise, find one more promising feature; + +3. 
extract the new feature; + +4. rebuild and validate the model, realize that the learning algorithm + parameters are not perfect for the new feature set; + +5. change machine learning algorithm parameters and retrain the model; + +6. find the ineffective feature subset and remove it from the feature set; + +7. try a few more new features; + +8. try another ML algorithm. And then a data format change is required. + +This is only a small episode in a data scientist’s daily life and it is what +makes our job different from a regular engineering job. + +Business context, ML algorithm knowledge and intuition all help you to find a +good model faster. But you never know for sure what ideas will bring you the +best value. + +This is why the iteration time is a critical parameter in data science process. +The quicker you iterate, the more you can check ideas and build a better model. + +> “A well-engineered pipeline gets data scientists iterating much faster, which +> can be a big competitive edge” From +> [Engineering Practices in Data Science](http://blog.untrod.com/2012/10/engineering-practices-in-data-science.html) +> By Chris Clark. + +## A data science iteration tool + +To speed up the iterations in data science projects we have created an open +source tool [data version control](http://dvc.org) or [DVC.org](http://dvc.org). + +DVC takes care of dependencies between commands that you run, generated data +files, and code files and allows you to easily reproduce any steps of your +research with regards to files changes. + +You can think about DVC as a Makefile for a data science project even though you +do not create a file explicitly. DVC tracks dependencies in your data science +projects when you run data processing or modeling code through a special +command: + +```dvc +$ dvc run python code/xml_to_tsv.py \ + data/Posts.xml data/Posts.tsv +``` + +`dvc run` works as a proxy for your commands. 
This allows DVC to track input and +output files, construct the dependency graph +([DAG](https://en.wikipedia.org/wiki/Directed_acyclic_graph)), and store the +command and parameters for a future command reproduction. + +The previous command will be automatically piped with the next command because +the file `data/Posts.tsv` is an output for the previous command and the input +for the next one: + +```dvc +# Split training and testing dataset. Two output files. +# 0.33 is the test dataset splitting ratio. +# 20170426 is a seed for randomization. +$ dvc run python code/split_train_test.py \ + data/Posts.tsv 0.33 20170426 \ + data/Posts-train.tsv data/Posts-test.tsv +``` + +DVC derives the dependencies automatically by looking at the list of the +parameters (even if your code ignores the parameters) and noting the file +changes before and after running the command. + +If you change one of your dependencies (data or code) then all the affected +steps of the pipeline will be reproduced: + +```dvc +# Change the data preparation code. +$ vi code/xml_to_tsv.py + +# Reproduce. +$ dvc repro data/Posts-train.tsv +Reproducing run command for data item data/Posts.tsv. +Reproducing run command for data item data/Posts-train.tsv. +``` + +This is a powerful way of quickly iterating through your pipeline. + +The pipeline might have a lot of steps and forms of acyclic dependencies between +the steps. Below is an example of a canonical machine learning pipeline (more +details in [the DVC tutorials](https://dvc.org/doc/tutorials)): + +`gist:dmpetrov/7704a5156bdc32c7379580a61e2fe3b6#dvc_pipeline.sh` + +## Why are regular pipeline tools not enough? + +> “Workflows are expected to be mostly static or slowly changing.” (See +> [Airflow](https://airflow.incubator.apache.org/).) + +Regular pipeline tools like [Airflow](http://airflow.incubator.apache.org) and +[Luigi](https://github.com/spotify/luigi) are good for representing static and +fault tolerant workflows. 
A huge portion of their functionality is created for +monitoring, optimization and fault tolerance. These are very important and +business critical problems. However, these problems are irrelevant to data +scientists’ daily lives. + +Data scientists need a lightweight, dynamic workflow management system. In +contrast to the traditional airflow-like system, DVC reflects the process of +researching and looking for a great model (and pipeline), not optimizing and +monitoring an existing one. This is why DVC is a good fit for iterative machine +learning processes. When a good model was discovered with DVC, the result could +be incorporated into a data engineering pipeline (Luigi or Airflow). + +## Pipelines and data sharing + +In addition to pipeline description, data reproduction and dynamic nature, DVC +has one more important feature. It was designed in accordance with the best +software engineering practices. DVC is based on Git. It keeps code, and stores +DAG in the Git repository which allows you to share your research results. But +it moves the actual file content outside the Git repository (in `.cache` +directory which DVC includes in `.gitignore`) since Git is not designed to +accommodate large data files. + +The data files can be shared between data scientists through cloud storages +using a simple command: + +```dvc +# Data scientists 1 syncs data to the cloud. +$ dvc sync data/ +``` + +![](../uploads/images/2017-05-15/git-server-or-github.jpeg) + +Currently, AWS S3 and GCP storage are supported by DVC. + +## Conclusion + +The productivity of data scientists can be improved by speeding up iteration +processes and the DVC tool takes care of this. + +We are very interested in your opinion and feedback. Please post your comments +here or contact us on Twitter — [FullStackML](https://twitter.com/FullStackML). + +If you found this tool useful, **please “star” the +[DVC Github repository](https://github.com/iterative/dvc)**. 
diff --git a/content/blogs/2017-07-24-r-code-and-reproducible-model-development-with-dvc.md b/content/blogs/2017-07-24-r-code-and-reproducible-model-development-with-dvc.md new file mode 100644 index 0000000000..fd40fcfad6 --- /dev/null +++ b/content/blogs/2017-07-24-r-code-and-reproducible-model-development-with-dvc.md @@ -0,0 +1,225 @@ +--- +title: R code and reproducible model development with DVC +date: 2017-07-24 +description: > + There are a lot of examples on how to use Data Version Control (DVC) with a + Python project. In this document I would like to see how it can be used with a + project in R. +descriptionLong: > + In this document we will briefly explore possibilities of a new open source + tool that could help with achieving code simplicity, readability and faster + model development. + + There are a lot of examples on how to use Data Version Control (DVC) with a + Python project. In this document I would like to see how it can be used with a + project in R. +picture: 2017-07-24/post-image.png +pictureComment: DAG on R example +author: marija_ilic +commentsUrl: https://discuss.dvc.org/t/r-code-and-reproducible-model-development-with-dvc/298 +tags: + - RStats + - R + - Tutorial +--- + +[DVC](https://dvc.org) or Data Version Control tool — its idea is to track +files/data dependencies during model development in order to facilitate +reproducibility and track data files versioning. Most of the +[DVC tutorials](https://dvc.org/doc/tutorials) provide good examples of using +DVC with Python language. However, I realized that DVC is a +[language agnostic](https://en.wikipedia.org/wiki/Language-agnostic) tool and +can be used with any programming language. In this blog post, we will see how to +use DVC in R projects. 
+ +## R coding — keep it simple and readable + +Each development is always a combination of following steps presented below: + +![Model development process](../uploads/images/2017-07-24/development-steps.png) +_Model development process_ + +Because of the specificity of the process — iterative development, it is very +important to improve some coding and organizational skills. For example, instead +of having one big R file with code it is better to split code in several logical +files — each responsible for one small piece of work. It is smart to track +history development with +[git](https://git-scm.com/book/en/v2/Getting-Started-About-Version-Control) +tool. Writing “_reusable code”_ is nice skill to have. Put comments in a code +can make our life easier. + +Beside git, next step in further improvements is to try out and work with DVC. +Every time when a change/commit in some of the codes and data sets is made, DVC +will reproduce new results with just one bash command on a linux (or Win +environment). It memorizes dependencies among files and codes so it can easily +repeat all necessary steps/codes instead of us worrying about the order. + +## R example — data and code clarification + +We’ll take an Python example from +[DVC tutorial](https://dvc.org/doc/tutorials/deep) (written by Dmitry Petrov) +and rewrite that code in R. With an example we’ll show how can DVC help during +development and what are its possibilities. + +Firstly, let’s initialize git and dvc on mentioned example and run our codes for +the first time. After that we will simulate some changes in the codes and see +how DVC works on reproducibility. + +R codes can be downloaded from the +[Github repository](https://github.com/Zoldin/R_AND_DVC). A brief explanation of +the codes is presented below: + +**parsingxml.R** — it takes xml that we downloaded from the web and creates +appropriate csv file. 
+ +`gist:Zoldin/47536af63182a0e8daf37a7b989e2e8d#parsingxml.R` + +**train_test_spliting.R** — stratified sampling by target variable (here we are +creating test and train data set) + +`gist:Zoldin/7591c47ce5988cbe087e0038c9a850b9#train_test_splitting.R` + +**featurization.R** — text mining and tf-idf matrix creation. In this part we +are creating predictive variables. + +`gist:Zoldin/9e79c047fd8ad7aa6596b0682aca83c6#featurization.R` + +**train_model.R** — with created variables we are building logistic regression +(LASSO). + +`gist:Zoldin/1617b39f2acbde3cd486616ac442e7cf#train_model.R` + +**evaluate.R** — with trained model we are predicting target on test data set. +AUC is final output which is used as evaluation metric. + +`gist:Zoldin/bfc2d4ee449098a9ff64b99c3326e61d#evaluate.r` + +Firstly, codes from above we will download into the new folder (clone the +repository): + +```dvc +$ mkdir R_DVC_GITHUB_CODE +$ cd R_DVC_GITHUB_CODE + +$ git clone https://github.com/Zoldin/R_AND_DVC +``` + +## DVC installation and initialization + +On the first site it seemed that DVC will not be compatible to work with R +because of the fact that DVC is written in Python and as that needs/requires +Python packages and pip package manager. Nevertheless, the tool can be used with +any programming language, it is language agnostic and as such is excellent for +working with R. + +Dvc installation: + +```dvc +$ pip3 install dvc +$ dvc init +``` + +With code below 5 R scripts with `dvc run` are executed. Each script is started +with some arguments — input and output file names and other parameters (seed, +splitting ratio etc). It is important to use `dvc run` — with this command R +script are entering pipeline (DAG graph). + +```dvc +$ dvc import https://s3-us-west-2.amazonaws.com/dvc-public/data/tutorial/nlp/25K/Posts.xml.zip \ + data/ + +# Extract XML from the archive. +$ dvc run tar zxf data/Posts.xml.tgz -C data/ + +# Prepare data. 
+$ dvc run Rscript code/parsingxml.R \ + data/Posts.xml \ + data/Posts.csv + +# Split training and testing dataset. Two output files. +# 0.33 is the test dataset splitting ratio. +# 20170426 is a seed for randomization. +$ dvc run Rscript code/train_test_spliting.R \ + data/Posts.csv 0.33 20170426 \ + data/train_post.csv \ + data/test_post.csv + +# Extract features from text data. +# Two TSV inputs and two pickle matrices outputs. +$ dvc run Rscript code/featurization.R \ + data/train_post.csv \ + data/test_post.csv \ + data/matrix_train.txt \ + data/matrix_test.txt + +# Train ML model out of the training dataset. +# 20170426 is another seed value. +$ dvc run Rscript code/train_model.R \ + data/matrix_train.txt 20170426 \ + data/glmnet.Rdata + +# Evaluate the model by the testing dataset. +$ dvc run Rscript code/evaluate.R \ + data/glmnet.Rdata \ + data/matrix_test.txt \ + data/evaluation.txt + +# The result. +$ cat data/evaluation.txt +``` + +## Dependency flow graph on R example + +Dependency graph is shown on picture below: + +![Dependency graph](../uploads/images/2017-07-24/dependency-graph.png)_Dependency +graph_ + +DVC memorizes this dependencies and helps us in each moment to reproduce +results. + +For example, lets say that we are changing our training model — using ridge +penalty instead of lasso penalty (changing alpha parameter to `0`). In that case +will change/modify `train_model.R` job and if we want to repeat model +development with this algorithm we don’t need to repeat all steps from above, +only steps marked red on a picture below: + +![](../uploads/images/2017-07-24/marked-steps.png) + +DVC knows based on DAG graph that changed `train_model.R` file will only change +following files: `Glmnet.RData` and `Evaluation.txt`. If we want to see our new +results we need to execute only `train_model.R` and `evaluate.R job`. It is cool +that we don’t have to think all the time what we need to repeat (which steps). 
+`dvc repro` command will do that instead of us. Here is a code example : + +```dvc +$ vi train_model.R +$ git commit -am "Ridge penalty instead of lasso" +$ dvc repro data/evaluation.txt + +Reproducing run command for data item data/glmnet.Rdata. Args: Rscript code/train_model.R data/matrix_train.txt 20170426 data/glmnet.Rdata +Reproducing run command for data item data/evaluation.txt. Args: Rscript code/evaluate.R data/glmnet.Rdata data/matrix_test.txt data/evaluation.txt + +$ cat data/evaluation.txt +"AUC for the test file is : 0.947697381983095" +``` + +`dvc repro` always re executes steps which are affected with the latest +developer changes. It knows what needs to be reproduced. + +DVC can also work in an _"multi-user environment”_ . Pipelines (dependency +graphs) are visible to others colleagues if we are working in a team and using +git as our version control tool. Data files can be shared if we set up a cloud +and with _dvc sync_ we specify which data can be shared and used for other +users. In that case other users can see the shared data and reproduce results +with those data and their code changes. + +## Summary + +DVC tool improves and accelerates iterative development and helps to keep track +of ML processes and file dependencies in the simple form. On the R example we +saw how DVC memorizes dependency graph and based on that graph re executes only +jobs that are related to the latest changes. It can also work in multi-user +environment where dependency graphs, codes and data can be shared among multiple +users. Because it is language agnostic, DVC allows us to work with multiple +programming languages within a single data science project. 
diff --git a/content/blogs/2017-07-27-data-version-control-in-analytics-devops-paradigm.md b/content/blogs/2017-07-27-data-version-control-in-analytics-devops-paradigm.md new file mode 100644 index 0000000000..b74569e551 --- /dev/null +++ b/content/blogs/2017-07-27-data-version-control-in-analytics-devops-paradigm.md @@ -0,0 +1,191 @@ +--- +title: Data Version Control in Analytics DevOps Paradigm +date: 2017-07-27 +description: > + Why DevOps matters in data science, what specific challenges data scientists + face in the day to day work, and how do we setup a better environment for the + team. +descriptionLong: > + The eternal dream of almost every Data Scientist today is to spend all the + time exploring new datasets, engineering new features, inventing and + validating cool new algorithms and strategies. However, daily routines of a + Data Scientist include raw data pre-processing, dealing with infrastructure, + bringing models to production. That's where good DevOps practices and skills + are essential and will certainly be beneficial for industrial Data Scientists + as they can address the above-mentioned challenges in a self-service manner. +picture: 2017-07-27/post-image.jpeg +author: george_vyshnya +commentsUrl: https://discuss.dvc.org/t/data-version-control-in-analytics-devops-paradigm/297 +tags: + - DevOps + - Company +--- + +## Data Science and DevOps Convergence + +The primary mission of DevOps is to help the teams to resolve various Tech Ops +infrastructure, tools and pipeline issues. + +At the other hand, as mentioned in the conceptual review by +[Forbes](https://www.forbes.com/sites/teradata/2016/11/14/devops-for-data-science-why-analytics-ops-is-key-to-value/) +in November 2016, the industrial analytics is no more going to be driven by data +scientists alone. It requires an investment in DevOps skills, practices and +supporting technology to move analytics out of the lab and into the business. 
+There are even +[voices](https://www.computing.co.uk/ctg/news/2433095/a-lot-of-companies-will-stop-hiring-data-scientists-when-they-realise-that-the-majority-bring-no-value-says-data-scientist) +calling Data Scientists to concentrate on agile methodology and DevOps if they +like to retain their jobs in business in the long run. + +## Why DevOps Matters + +The eternal dream of almost every Data Scientist today is to spend all (well, +almost all) the time in the office exploring new datasets, engineering decisive +new features, inventing and validating cool new algorithms and strategies. +However, reality is often different. One of the unfortunate daily routines of a +Data Scientist work is to do raw data pre-processing. It usually translates to +the challenges to + +1. **Pull all kinds of necessary data from a variety of sources** + + - Internal data sources like ERP, CRM, POS systems, or data from online + e-commerce platforms + + - External data, like weather, public holidays, Google trends etc. + +2. **Extract, transform, and load the data** + + - Relate and join the data sources + + - Aggregate and transform the data + +3. **Avoid technical and performance drawbacks** when everything ends up in + “one big table” at the end + +4. **Facilitate continuous machine learning and decision-making in a + business-ready framework** + + - Utilize historic data to train the machine learning models and algorithms + + - Use the current, up-to-date data for decision-making + + - Export back the resulting decisions/recommendations to review by business + stakeholders, either back into the ERP system or some other data warehouse + +Another big challenge is to organize **collaboration and data/model sharing** +inside and across the boundaries of teams of Data Scientists and Software +Engineers. 
+ +DevOps skills as well as effective instruments will certainly be beneficial for +industrial Data Scientists as they can address the above-mentioned challenges in +a self-service manner. + +## Can DVC Be a Solution? + +[Data Version Control](https://dvc.org) or simply DVC comes to the scene +whenever you start looking for effective DevOps-for-Analytics instruments. + +DVC is an open source tool for data science projects. It makes your data science +projects reproducible by automatically building data dependency graph (DAG). +Your code and the dependencies could be easily shared by Git, and data — through +cloud storage (AWS S3, GCP) in a single DVC environment. + +> Although DVC was created for machine learning developers and data scientists +> [originally](https://dvc.org/doc/understanding-dvc/what-is-dvc), it appeared +> to be useful beyond it. Since it brings proven engineering practices to not +> well defined ML process, I discovered it to have enormous potential as an +> Analytical DevOps instrument. + +It clearly helps to manage a big fraction of DevOps issues in daily Data +Scientist routines + +1. **Pull all kinds of necessary data from a variety of sources**. Once you + configure and script your data extraction jobs with DVC, it will be + persistent and operable across your data and service infrastructure + +2. **Extract, transform, and load the data**. ETL is going to be easy and + repeatable once you configure it with DVC scripting. It will become a solid + pipeline to operate without major supportive effort. Moreover, it will track + all changes and trigger an alert for updates in the pipeline steps via DAG. + +3. **Facilitate continuous machine learning and decision-making.** The part of + the pipeline facilitated through DVC scripting can be jobs to upload data + back to any transactional system (like ERP, ERM, CRM etc.), warehouse or data + mart. It will then be exposed to business stakeholders to make intelligent + data-driven decisions. + +4. 
**Share your algorithms and data**. Machine Learning modeling is an iterative + process and it is extremely important to keep track of your steps, + dependencies between the steps, dependencies between your code and data files + and all code running arguments. This becomes even more important and + complicated in a team environment where data scientists’ collaboration takes + a serious amount of the team’s effort. DVC will be the arm to help you with + it. + +One of the ‘juicy’ features of DVC is ability to support multiple technology +stacks. Whether you prefer R or use promising Python-based implementations for +your industrial data products, DVC will be able to support your pipeline +properly. You can see it in action for both +[Python-based](https://blog.dvc.org/how-data-scientists-can-improve-their-productivity) +and +[R-based](https://blog.dvc.org/r-code-and-reproducible-model-development-with-dvc) +technical stacks. + +As such, DVC is going to be one of the tools you would enjoy to use if/when you +embark on building continual analytical environment for your system or across +your organization. + +## Continual Analytical Environment and DevOps + +Building a production pipeline is quite different from building a +machine-learning prototype on a local laptop. Many teams and companies face the +challenges there. + +At the bare minimum, the following requirements shall be met when you move your +solution into production + +1. Periodic re-training of the models/algorithms + +2. Ease of re-deployment and configuration changes in the system + +3. Efficiency and high performance of real-time scoring the new out-of-sample + observations + +4. Availability of the monitor model performance over time + +5. Adaptive ETL and ability to manage new data feeds and transactional systems + as data sources for AI and machine learning tools + +6. Scaling to really big data operations + +7. Security and Authorized access levels to different areas of the analytical + systems + +8. 
Solid backup and recovery processes/tools + +This goes into the territory traditionally inhabited by DevOps. Data Scientists +should ideally learn to handle the part of those requirements themselves or at +least be informative consultants to classical DevOps gurus. + +DVC can help in many aspects of the production scenario above as it can +orchestrate relevant tools and instruments through its scripting. In such a +setup, DVC scripts will be sharable manifestation (and implementation) of your +production pipeline where each step can be transparently reviewed, easily +maintained, and changed as needed over time. + +## Will DevOps Be Captivating? + +If you are further interested in understanding the ever-proliferating role of +DevOps in the modern Data Science and predictive analytics in business, there +are good resources for your review below + +1. [DevOps For Data Science: Why Analytics Ops Is Key To Value](https://www.forbes.com/sites/teradata/2016/11/14/devops-for-data-science-why-analytics-ops-is-key-to-value/) + (Forbes, Nov 14, 2016) + +2. [Bridging the Gap Between Data Science and DevOps](https://www.packtpub.com/books/content/bridging-gap-between-data-science-and-devops) + +3. [Is DevOps Making Life Better for Data Scientists?](https://devops.com/devops-life-better-data-scientists/) + +By any mean, DVC is going to be a useful instrument to fill the multiple gaps +between the classical in-lab old-school data science practices and growing +demands of business to build solid DevOps processes and workflows to streamline +mature and persistent data analytics. 
diff --git a/content/blogs/2017-08-23-ml-model-ensembling-with-fast-iterations.md b/content/blogs/2017-08-23-ml-model-ensembling-with-fast-iterations.md new file mode 100644 index 0000000000..e225c2e560 --- /dev/null +++ b/content/blogs/2017-08-23-ml-model-ensembling-with-fast-iterations.md @@ -0,0 +1,240 @@ +--- +title: ML Model Ensembling with Fast Iterations +date: 2017-08-23 +description: > + Here we'll talk about tools that help tackling common technical challenges of + building pipelines for the ensemble learning. +descriptionLong: > + In many real-world Machine Learning projects, there is a need to ensemble + complex models as well as maintain pipelines. As we will demonstrate, DVC is a + good tool that helps tackling common technical challenges of building + pipelines for the ensemble learning. +picture: 2017-08-23/post-image.png +author: george_vyshnya +commentsUrl: https://discuss.dvc.org/t/ml-model-ensembling-with-fast-iterations/296 +tags: + - Best Practices + - Model Ensembling + - R + - Tutorial +--- + +In a model ensembling setup, the final prediction is a composite of predictions +from individual machine learning algorithms. To make the best model composite, +you have to try dozens of combinations of weights for the model set. It takes a +lot of time to come up with the best one. That is why the iteration speed is +crucial in the ML model ensembling. We are going to make our research +reproducible by using [Data Version Control](http://dvc.org) tool - +([DVC](http://dvc.org)). It provides the ability to quickly re-run and replicate +the ML prediction result by executing just a single command `dvc repro`. + +As we will demonstrate, DVC is a good tool that helps tackling common technical +challenges of building pipelines for the ensemble learning. 
+ +## Project Overview + +In this case, we will build an R-based solution to attack the +supervised-learning regression problem to predict win sales per +[Predict Wine Sales](https://inclass.kaggle.com/c/pred-411-2016-04-u3-wine/) +Kaggle competition. + +An ensemble prediction methodology will be used in the project. The weighted +ensemble of three models will be implemented, trained, and predicted from +(namely, these are Linear Regression, `GBM`, and `XGBoost`). + +![](../uploads/images/2017-08-23/ensemble-prediction-methodology.png) + +If properly designed and used, ensemble prediction can perform much better then +predictions of individual machine learning models composing the ensemble. + +Prediction results will be delivered in a format of output CSV file that is +specified in the requirements to the +[Predict Wine Sales](https://inclass.kaggle.com/c/pred-411-2016-04-u3-wine/) +Kaggle competition (so called Kaggle submission file). + +## Important Pre-Requisites + +In order to try the materials of this +[repository](https://github.com/gvyshnya/DVC_R_Ensemble) in your environment, +the following software should be installed on your machine + +- **_Python 3_** runtime environment for your OS (it is required to run DVC + commands in the batch files) + +- **_DVC_** itself (you can install it as a python package by simply doing the + standard command in your command line prompt: `pip install dvc`) + +- **_R_** **_3.4.x_** runtime environment for your OS + +- **_git_** command-line client application for your OS + +## Technical Challenges + +The technical challenges of building the ML pipeline for this project were to +meet business requirements below + +- Ability to conditionally trigger execution of 3 different ML prediction models + +- Ability to conditionally trigger model ensemble prediction based on + predictions of those 3 individual models + +- Ability to specify weights of each of the individual model predictions in the + ensemble + +- Quick and fast 
redeployment and re-run of the ML pipeline upon frequent + reconfiguration and model tweaks + +- Reproducibility of the pipeline and forecasting results across the multiple + machines and team members + +The next sections below will explain how these challenges are addressed in the +design of ML pipeline for this project. + +## ML Pipeline + +The ML pipeline for this project is presented in the diagram below + +![](../uploads/images/2017-08-23/ml-pipeline.png) + +As you can see, the essential implementation of the solution is as follows + +- [`preprocessing.R`](https://gist.github.com/gvyshnya/443424775b0150baac774cc6cf3cb1cc) + handles all aspects of data manipulations and pre-processing (reading training + and testing data sets, removing outliers, imputing NAs etc.) as well as stores + refined training and testing set data as new files to reuse by model scripts + +- 3 model scripts implement training and forecasting algorithms for each of the + models selected for this project + ([`LR.R`](https://gist.github.com/gvyshnya/7ec76316c24bc1b4f595ef1256f52d3a), + [`GBM.R`](https://gist.github.com/gvyshnya/50e5ea3efa9771d2e7cc121c2f1a04e4), + [`xgboost.R`](https://gist.github.com/gvyshnya/2e5799863f02fec652c194020da82dd3)) + +- [`ensemble.R`](https://gist.github.com/gvyshnya/84379d6a68fd085fe3a26aabad453e55) + is responsible for the weighted ensemble prediction and the final output of + the Kaggle submission file + +- `config.R` is responsible for all of the conditional logic switches needed in + the pipeline (it is included as a source to all of modeling and ensemble + prediction scripts, to get this done) + +There is a special note about lack of feature engineering for this project. It +was an intended specification related to the specifics of the dataset. The +existing features were quite instrumental to predict the target values ‘as is’. 
+Therefore it had been decided to follow the well-known +[Pareto principle](https://en.wikipedia.org/wiki/Pareto_principle) (interpreted +as “**_20% of efforts address 80% of issues_**”, in this case) and not to spend +more time on it. + +**_Note_**: all `R` and batch files mentioned throughout this blog post are +available online in a separate GitHub +[repository](https://github.com/gvyshnya/DVC_R_Ensemble). You will be also able +to review more details on the implementation of each of the machine learning +prediction models there. + +### Pipeline Configuration Management + +All of the essential tweaks to conditional machine learning pipeline for this +project is managed by a configuration file. For ease of its use across solution, +it was implemented as an R code module (`config.R`), to be included to all model +training and forecasting. Thus the respective parameters (assigned as R +variables) will be retrieved by the runnable scripts, and the conditional logic +there will be triggered respectively. + +This file is not intended to run from a command line (unlike the rest of the R +scripts in the project). + +`gist:gvyshnya/918e94b06ebf222f6bb56ed26a5f44ee#config.R` + +### Why Do We Need DVC? + +As we all know, there is no way to build the ideal ML model with sound +prediction accuracy from the very beginning. You will have to continuously +adjust your algorithm/model implementations based on the cross-validation +appraisal until you yield the blooming results. This is especially true in the +ensemble learning where you have to constantly tweak not only parameters of the +individual prediction models but also the settings of the ensemble itself + +- changing ensemble composition — adding or removing individual prediction + models + +- changing model prediction weights in the resulting ensemble prediction + +Under such a condition, DVC will help you to manage your ensemble ML pipeline in +a really solid manner. 
Let’s consider the following real-world scenario + +- Your team member changes the settings of `GBM` model and resubmit its + implementation to (this is emulated by the commit + [#8604103f0](https://github.com/gvyshnya/DVC_R_Ensemble/commit/27825d0732f72f07e7e4e48548ddb8a8604103f0), + check sum `27825d0`) + +- You rerun the entire ML pipeline on your computer, to get the newest + predictions from `GBM` as well as the updated final ensemble prediction + +- The results of the prediction appeared to be still not optimal thus someone + changes the weights of individual models in the ensemble, assigning `GBM` + higher weight vs. `xgboost` and `LR` + +- After the ensemble setup changes committed (and updated `config.R` appeared in + the repository, as emulated by the commit + [#eb97612ce](https://github.com/gvyshnya/DVC_R_Ensemble/commit/5bcbe115afcb24886abb4734ff2da42eb97612ce), + check sum `5bcbe11`), you re-run the model predictions and the final ensemble + prediction on your machine once again + +All that you need to do to handle the changes above is simply to keep running +your **DVC** commands per the script developed (see the section below). You do +not have to remember or know explicitly the changes being made into the project +codebase or its pipeline configuration. **DVC** will automatically check out +latest changes from the repo as well as make sure it runs only those steps in +the pipeline that were affected by the recent changes in the code modules. + +### Orchestrating the Pipeline : DVC Command File + +After we developed individual R scripts needed by different steps of our Machine +Learning pipeline, we orchestrate it together using DVC. 
+ +Below is a batch file illustrating how DVC manages steps of the machine learning +process for this project + +`gist:gvyshnya/7f1b8262e3eb7a8b3c16dbfd8cf98644#dvc.bat` + +If you then further edit ensemble configuration setup in `code/config.R`, you +can simply leverage the power of DVC as for automatic dependencies resolving and +tracking to rebuild the new ensemble prediction as follows + +`gist:gvyshnya/9d80e51ba3d7aa5bd37d100ed82376ee` + +## Summary + +In this blog post, we worked through the process of building an ensemble +prediction pipeline using DVC. The essential key features of that pipeline were +as follows + +- **_reproducibility_** — everybody on a team can run it on their premise + +- **_separation of data and code_** — this ensured everyone always runs the + latest versions of the pipeline jobs with the most up-to-date ‘golden copy’ of + training and testing data sets + +The helpful side effect of using DVC was you stop keeping in mind what was +changed on every step of modifying your project scripts or in the pipeline +configuration. Due to it maintaining the dependencies graph (DAG) automatically, +it automatically triggered the only steps that were affected by the particular +changes, within the pipeline job setup. It, in turn, provides the capability to +quickly iterate through the entire ML pipeline. + +> As DVC brings proven engineering practices to often suboptimal and messy ML +> processes as well as helps a typical Data Science project team to eliminate a +> big chunk of common +> [DevOps overheads](https://blog.dataversioncontrol.com/data-version-control-in-analytics-devops-paradigm-35a880e99133), +> I found it extremely useful to leverage DVC on the industrial data science and +> predictive analytics projects. + +## Further Reading + +1. [Ensemble Learning and Prediction Introduction](https://en.wikipedia.org/wiki/Ensemble_learning) + +2. 
[Using DVC in Machine Learning projects in Python](https://blog.dataversioncontrol.com/data-version-control-beta-release-iterative-machine-learning-a7faf7c8be67) + +3. [Using DVC in Machine Learning projects in R](https://blog.dataversioncontrol.com/r-code-and-reproducible-model-development-with-dvc-1507a0e3687b) + +4. [Kaggle Ensembling Guide](https://mlwave.com/kaggle-ensembling-guide/) diff --git a/content/blogs/2017-09-26-best-practices-of-orchestrating-python-and-r-code-in-ml-projects.md b/content/blogs/2017-09-26-best-practices-of-orchestrating-python-and-r-code-in-ml-projects.md new file mode 100644 index 0000000000..0322f7d956 --- /dev/null +++ b/content/blogs/2017-09-26-best-practices-of-orchestrating-python-and-r-code-in-ml-projects.md @@ -0,0 +1,262 @@ +--- +title: Best practices of orchestrating Python and R code in ML projects +date: 2017-09-26 +description: > + What is the best way to integrate R and Python languages in one data science + project? What are the best practices? +descriptionLong: > + Today, data scientists are generally divided among two languages — some prefer + R, some prefer Python. I will try to find an answer to a question: “What is + the best way to integrate both languages in one data science project? What are + the best practices?” +picture: 2017-09-26/post-image.jpg +pictureComment: | + Image was taken from + [this](http://intersog.com/blog/r-and-python-for-data-science-worthy-opponents/) + page +author: marija_ilic +commentsUrl: https://discuss.dvc.org/t/best-practices-of-orchestrating-python-and-r-code-in-ml-projects/295 +tags: + - R + - Python + - Tutorial + - Best Practices +--- + +Beside Git and shell scripting additional tools are developed to facilitate the +development of predictive model in a multi-language environments. For fast data +exchange between R and Python let’s use binary data file format +[Feather](https://blog.rstudio.com/2016/03/29/feather/). 
Another language +agnostic tool [DVC](http://dvc.org) can make the research reproducible — let’s +use DVC to orchestrate R and Python code instead of a regular shell scripts. + +## Machine learning with R and Python + +Both R and Python are having powerful libraries/packages used for predictive +modeling. Usually algorithms used for classification or regression are +implemented in both languages and some scientist are using R while some of them +preferring Python. In an example that was explained in previous +[tutorial](https://blog.dataversioncontrol.com/r-code-and-reproducible-model-development-with-dvc-1507a0e3687b) +target variable was binary output and logistic regression was used as a training +algorithm. One of the algorithms that could also be used for prediction is a +popular [Random Forest algorithm](https://en.wikipedia.org/wiki/Random_forest) +which is implemented in both programming languages. Because of performances it +was decided that Random Forest classifier should be implemented in Python (it +shows better performances than random forest package in R). + +## R example used for DVC demo + +We will use the same example from previous blog +[story](https://blog.dataversioncontrol.com/r-code-and-reproducible-model-development-with-dvc-1507a0e3687b), +add some Python codes and explain how Feather and DVC can simplify the +development process in this combined environment. + +Let’s recall briefly the R codes from previous tutorial: + +![R Jobs](../uploads/images/2017-09-26/r-jobs.png)_R Jobs_ + +Input data are StackOverflow posts — an XML file. Predictive variables are +created from text posts — relative importance +[tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) of words among all +available posts is calculated. With tf-idf matrices target is predicted and +lasso logistic regression for predicting binary output is used. AUC is +calculated on the test set and AUC metric is used on evaluation. 
+ +Instead of using logistic regression in R we will write Python jobs in which we +will try to use random forest as training model. Train_model.R and evaluate.R +will be replaced with appropriate Python jobs. + +R codes can be seen +[here](https://blog.dataversioncontrol.com/r-code-and-reproducible-model-development-with-dvc-1507a0e3687b). + +Code for `train_model_Python.py` is presented below: + +`gist:Zoldin/b312897cc492608feef1eaeae7f6eabc#train_model_Python.py` + +Also here we are adding code for `evaluation_python_model.py`: + +`gist:Zoldin/9eef13632d0a9039fe9b0dba376516a4#evaluation_python_model.py` + +Let’s download necessary R and Python codes from above (clone the +[Github](https://github.com/Zoldin/R_AND_DVC) repository): + +```dvc +$ mkdir R_DVC_GITHUB_CODE +$ cd R_DVC_GITHUB_CODE + +$ git clone https://github.com/Zoldin/R_AND_DVC +``` + +Our dependency graph of this data science project look like this: + +![R (marked red) and Python (marked pink) jobs in one project](../uploads/images/2017-09-26/our-dependency-graph.png)_R +(marked red) and Python (marked pink) jobs in one project_ + +Now lets see how it is possible to speed up and simplify process flow with +Feather API and data version control reproducibility. + +## Feather API + +Feather API is designed to improve meta data and data interchange between R and +Python. It provides fast import/export of data frames among both environments +and keeps meta data information which is an improvement over data exchange via +csv/txt file format. In our example Python job will read an input binary file +that was produced in R with Feather api. + +Let’s install Feather library in both environments. + +For Python 3 on linux environment you can use cmd and pip3: + +```dvc +$ sudo pip3 install feather-format +``` + +For R it is necessary to install feather package: + +```R +install.packages(feather) +``` + +After successful installation we can use Feather for data exchange. 
+ +Below is an R syntax for data frame export with Feather (featurization.R): + +```R +library(feather) + +write_feather(dtm_train_tfidf,args[3]) +write_feather(dtm_test_tfidf,args[4]) +print("Two data frame were created with Feather - one for train and one for test data set") +``` + +Python syntax for reading feather input binary files (train_model_python.py): + +```python +import feather as ft + +input = sys.argv[1] +df = ft.read_dataframe(input) +``` + +## Dependency graph with R and Python combined + +The next question what we are asking ourselves is why do we need DVC, why not +just use shell scripting? DVC automatically derives the dependencies between the +steps and builds +[the dependency graph (DAG)](https://en.wikipedia.org/wiki/Directed_acyclic_graph) +transparently to the user. Graph is used for reproducing parts/codes of your +pipeline which were affected by recent changes and we don’t have to think all +the time what we need to repeat (which steps) with the latest changes. + +Firstly, with `dvc run` command we will execute all jobs that are related to our +model development. In that phase DVC creates dependencies that will be used in +the reproducibility phase: + +```dvc +$ dvc import https://s3-us-west-2.amazonaws.com/dvc-public/data/tutorial/nlp/25K/Posts.xml.zip \ + data/ + +$ dvc run tar zxf data/Posts.xml.tgz -C data/ + +$ dvc run Rscript code/parsingxml.R \ + data/Posts.xml data/Posts.csv + +$ dvc run Rscript code/train_test_spliting.R \ + data/Posts.csv 0.33 20170426 \ + data/train_post.csv data/test_post.csv + +$ dvc run Rscript code/featurization.R \ + data/train_post.csv \ + data/test_post.csv data/matrix_train.feather \ + data/matrix_test.feather + +$ dvc run python3 code/train_model_python.py \ + data/matrix_train.feather \ + 20170426 data/model.p + +$ dvc run python3 code/evaluate_python_mdl.py \ + data/model.p data/matrix_test.feather \ + data/evaluation_python.txt +``` + +After this commands jobs are executed and included in DAG graph. 
Result (AUC +metrics) is written in evaluation_python.txt file: + +```dvc +$ cat data/evaluation_python.txt +AUC: 0.741432 +``` + +It is possible to improve our result with random forest algorithm. + +We can increase number of trees in the random forest classifier — from 100 to +500: + +```python +clf = RandomForestClassifier(n_estimators=500, + n_jobs=2, + random_state=seed) +clf.fit(x, labels) +``` + +After commited changes (in `train_model_python.py`) with `dvc repro` command all +necessary jobs for `evaluation_python.txt` reproduction will be re-executed. We +don’t need to worry which jobs to run and in which order. + +```dvc +$ git add . +$ git commit +[master a65f346] Random forest classifier — more trees added + 1 file changed, 1 insertion(+), 1 deletion(-) + +$ dvc repro data/evaluation_python.txt + +Reproducing run command for data item data/model.p. Args: python3 code/train_model_python.py data/matrix_train.txt 20170426 data/model.p +Reproducing run command for data item data/evaluation_python.txt. Args: python3 code/evaluate_python_mdl.py data/model.p data/matrix_test.txt data/evaluation_python.txt +Data item “data/evaluation_python.txt” was reproduced. +``` + +Beside code versioning, DVC also cares about data versioning. For example, if we +change data sets `train_post.csv` and `test_post.csv` (use different splitting +ratio) DVC will know that data sets are changed and `dvc repro` will re-execute +all necessary jobs for evaluation_python.txt. + +```dvc +$ dvc run Rscript code/train_test_spliting.R \ + data/Posts.csv 0.15 20170426 \ + data/train_post.csv \ + data/test_post.csv +``` + +Re-executed jobs are marked with red color: + +![](../uploads/images/2017-09-26/re-executed-jobs.png) + +```dvc +$ dvc run Rscript code/train_test_spliting.R \ + data/Posts.csv 0.15 20170426 \ + data/train_post.csv \ + data/test_post.csv + +$ dvc repro data/evaluation_python.txt + +Reproducing run command for data item data/matrix_train.txt. 
Args: Rscript — vanilla code/featurization.R data/train_post.csv data/test_post.csv data/matrix_train.txt data/matrix_test.txt +Reproducing run command for data item data/model.p. Args: python3 code/train_model_python.py data/matrix_train.txt 20170426 data/model.p +Reproducing run command for data item data/evaluation_python.txt. Args: python3 code/evaluate_python_mdl.py data/model.p data/matrix_test.txt data/evaluation_python.txt + +Data item “data/evaluation_python.txt” was reproduced. + +$ cat data/evaluation_python.txt +AUC: 0.793145 +``` + +New AUC result is 0.793145 which shows an improvement compared to previous +iteration. + +## Summary + +In data science projects it is often used R/Python combined programming. +Additional tools beside git and shell scripting are developed to facilitate the +development of predictive model in a multi-language environments. Using data +version control system for reproducibility and Feather for data interoperability +helps you orchestrate R and Python code in a single environment. diff --git a/content/blogs/2018-10-18-ml-best-practices-in-pytorch-dev-conf-2018.md b/content/blogs/2018-10-18-ml-best-practices-in-pytorch-dev-conf-2018.md new file mode 100644 index 0000000000..cf1cdc194a --- /dev/null +++ b/content/blogs/2018-10-18-ml-best-practices-in-pytorch-dev-conf-2018.md @@ -0,0 +1,157 @@ +--- +title: ML best practices in PyTorch dev conf 2018 +date: 2018-10-18 +description: > + In the Machine Learning (ML) field tools and techniques for best practices are + just starting to be developed. +descriptionLong: > + In the Machine Learning (ML) field tools and techniques for best practices are + just starting to be developed. At the PyTorch developer conference (PTDC-18), + several speakers including **Jerome Pesenti, VP of AI from Facebook** and + **Andrej Karpathy, Director of Tesla AI** spoke about best practices for + machine learning development. 
+picture: 2018-10-18/post-image.jpeg +pictureComment: | + The image source: + [link](https://blog.hubspot.com/customers/bid/109553/5-Homepage-Design-Best-Practices) +author: dmitry_petrov +commentsUrl: https://discuss.dvc.org/t/ml-best-practices-in-pytorch-dev-conf-2018/294 +tags: + - Machine Learning + - Best Practices + - PyTorch + - PTDC-18 + - Company +--- + +The issues discussed included applying traditional software development +techniques like unit testing, CI/CD systems, automated deployment, version +control, and more to the ML field. In this blog post, we will go over the best +practices ideas from PTDC-18 and the future of ML tool developments. + +## 1. Engineering practices from PyTorch developers + +In the PTDC-18 +[keynote speech](https://www.facebook.com/pytorch/videos/482401942168584/), +**Jerome Pesenti** described the motivation and goals of PyTorch project and +what the future of machine learning looks like. + +### 1.1. ML tooling future + +Regarding the future of ML, Jerome envisioned a “streamlined development, more +accessible tools, breakthrough hardware, and more”. Talking about the gap huge +gap between software engineering and ML engineering, Presenti said: + +> Machine learning engineering is where we were in Software Engineering 20 years +> ago. A lot of things still need to be invented. We need to figure out what +> testing means, what CD (continuous delivery) means, we need to develop tools +> and environments that people can develop **robust ML that does not have too +> many biases** and does not overfit. + +In that gap lives many opportunities to develop new tools and services. We in +the ML ecosystem are called upon to implement the future of machine learning +tools. Traditional software engineering has many useful tools and techniques +which can either be repurposed for Machine Learning development or used as a +source for ideas in developing new tools. + +### 1.2. 
PyTorch motivation + +PyTorch 1.0 implements one important engineering principle — “a seamless +transition from AI research to production”. It helps to move AI technology from +research into production as quickly as possible. In order to do that a few +challenges were solved: + +1. **Write code once** — not have to rewrite or re-optimize code to go from + research to prod. + +1. **Performance** — training model on large datasets. + +1. **Other languages** — not only Python which is great for prototyping but also + C++ and other languages. + +1. **Scaling** — deploy PyTorch at scale more easily. + +## 2. Engineering practices for software 2.0 + +### 2.1. Melting of software 2.0 and software 1.0 + +**Andrej Karpathy** from Tesla AI had a +[dedicated talk](https://www.facebook.com/pytorch/videos/169366590639145/) about +best engineering practices in ML. He drew a contrast between traditional +software development (software 1.0) with software utilizing Machine Learning +techniques (software 2.0), saying that + +> “software 2.0 code also has new feature demands, contains bugs, and requires +> iterations.” + +Meaning that ML development has a lifecycle similar to traditional software: + +> “When you are working with these [neural] networks **in production** you are +> doing much more than that [training and measuring models]. You maintaining the +> codebase and that codebase is alive is just like 1.0 code.” + +Machine Learning models need to grow and develop feature-by-feature, bugs need +to be found and fixed, and repeatable processes are a must, as in earlier non-ML +software development practices. + +### 2.2. Software 2.0 best practices + +Karpathy went on to describe how software 1.0 best practices can be used in +software 2.0 (ML modeling): + +1. **Test-driven development** — test/train dataset separation is not enough + since it describes only expected performance. Edge cases have to be tested to + ensure the model performs as required. 
That requires incorporating more + examples in datasets, or changing model architecture, or changing + optimization functions. + +1. **Continues Integration and Continues Delivery** (CI/CD) — Intelligently used + of CI/CD can propel a team into rapid agile development of software systems. + The phases of CI/CD jobs include: 1) ML model auto re-training when code or + dataset changes; 2) running unit-tests; 3) easy access to the last model; 4) + Auto-deployment to test and/or production systems. + +1. **Version Control** — track all the changes in datasets (labels), not only + code. + +1. Train a **single model** from scratch every time without using other + pre-trained models. (External pre-trained models don’t count as far as I + understand.) A chain of fine-tuning models very quickly disintegrates + codebase. In software 1.0 a single **monorepo** is an analog of a single + model which also helps to avoid disintegration. + +This list of best practices shows how serious Tesla AI is about robust software +which is not surprising for self-driving car area. Any company needs these +practices in order to organize a manageable ML development process. + +## 3. Data file-centric tools + +Frameworks and libraries like PyTorch make a significant step in machine +learning tooling and bringing the best practices. However, frameworks and +libraries might be not enough for many of the ML best practices. For example, +dataset versioning, ML model versioning, continuous integration (CI) and +continuous delivery (CD) requires manipulation and transferring data files. +These can be done in a **more efficient and natural way by data management +tools** and storage systems rather than libraries. + +The need for a machine learning artifact manipulation tool with **data +file-centric philosophy** was the major motivation behind open source project +that we created — Data Version Control (DVC) or [DVC.org](http://dvc.org). 
+ +DVC connects Git with data files and machine learning pipelines which helps keep +version control on machine learning models and datasets using familiar Git +semantics coupled with the power of cloud storage systems such as Amazon’s S3, +Google’s GCS, Microsoft’s Azure or bare-metal servers accessed by SSH. + +If PyTorch helps in organizing code inside an ML project then data-centric tools +like DVC help organized different pieces of ML projects into a single workflow. +The machine learning future requires both types of tools — code level and data +file level. + +## Conclusion + +Thus far only the first steps have been taken toward using machine learning +tooling and the best machine learning practices. Mostly large companies are +using these practices because they faced the problems a while ago. Best +practices should be embraced by the entire industry which will help to bring +machine learning to a higher new level. diff --git a/content/blogs/2019-03-05-march-19-dvc-heartbeat.md b/content/blogs/2019-03-05-march-19-dvc-heartbeat.md new file mode 100644 index 0000000000..c2cb188612 --- /dev/null +++ b/content/blogs/2019-03-05-march-19-dvc-heartbeat.md @@ -0,0 +1,164 @@ +--- +title: March ’19 DVC❤️Heartbeat +date: 2019-03-05 +description: > + The very first issue of the DVC Heartbeat! News, links, Discord discussions + from the community. +descriptionLong: > + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: 2019-03-05/post-image.jpeg +author: svetlana_grinchenko +commentsUrl: https://discuss.dvc.org/t/march-19-dvc-heartbeat/293 +tags: + - Heartbeat + - Community +--- + +This is the very first issue of the DVC❤️Heartbeat. 
Every month we will be +sharing our news, findings, interesting reads, community takeaways, and +everything along the way. + +Some of those are related to our brainchild [DVC](https://dvc.org) and its +journey. The others are a collection of exciting stories and ideas centered +around ML best practices and workflow. + +## News and links + +We read a ton of articles and posts every day and here are a few that caught our +eye. Well-written, offering a different perspective and definitely worth +checking. + +- **[Data science is different now](https://veekaybee.github.io/2019/02/13/data-science-is-different/) + by [Vicki Boykis](https://veekaybee.github.io/)** + + + +> What is becoming clear is that, in the late stage of the hype cycle, data +> science is asymptotically moving closer to engineering, and the +> [skills that data scientists need](https://www.youtube.com/watch?v=frQeK8xo9Ls) +> moving forward are less visualization and statistics-based, and +> [more in line with traditional computer science curricula](https://tech.trivago.com/2018/12/03/teardown-rebuild-migrating-from-hive-to-pyspark/). + +- **[Data Versioning](https://emilygorcenski.com/post/data-versioning/) by + [Emily F. Gorcenski](https://emilygorcenski.com/)** + + + +> I want to explore how the degrees of freedom in versioning machine learning +> systems poses a unique challenge. I’ll identify four key axes on which machine +> learning systems have a notion of version, along with some brief +> recommendations for how to simplify this a bit. + +- **[Reproducibility in Machine Learning](https://blog.mi.hdm-stuttgart.de/index.php/2019/02/26/reproducibility-in-ml/) + by [Pascal Fecht](https://blog.mi.hdm-stuttgart.de/index.php/author/pf023/)** + + + +> ...the objective of this post is not to philosophize about the dangers and +> dark sides of AI. 
In fact, this post aims to work out common challenges in +> reproducibility for machine learning and shows programming differences to +> other areas of Computer Science. Secondly, we will see practices and workflows +> to create a higher grade of reproducibility in machine learning algorithms. + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We will be sifting through the issues and discussions and share the most +interesting takeaways. + +### Q: [Edit and define DVC files manually, in a Makefile style](https://discordapp.com/channels/485586884165107732/485586884165107734/541622187296161816) + +There is no separate guide for that, but it is very straight forward. See +[DVC file format](https://dvc.org/doc/user-guide/project-structure) description +for how DVC file looks inside in general. All `dvc add` or `dvc run` does is +just computing `md5` fields in it, that is all. You could write your DVC-file +and then run `dvc repro` that will run a command(if any) and compute all needed +checksums,[read more](https://discordapp.com/channels/485586884165107732/485586884165107734/541622187296161816). + +### Q: [Best practices to define the code dependencies](https://discordapp.com/channels/485586884165107732/485586884165107734/547424240677158915) + +There’s a ton of code in that project, and it’s very non-trivial to define the +code dependencies for my training stage — there are a lot of imports going on, +the training code is distributed across many modules, +[read more](https://discordapp.com/channels/485586884165107732/485586884165107734/547424240677158915) + +### Q: [Azure data lake support](https://discordapp.com/channels/485586884165107732/485586884165107734/548495589428428801) + +DVC officially only supports regular Azure blob storage. Gen1 Data Lake should +be accessible by the same interface, so configuring a regular azure remote for +DVC should work. Seems like Gen2 Data Lake +[has disable](https://discordapp.com/channels/485586884165107732/485586884165107734/550546413197590539) +blob API. 
If you know more details about the difference between Gen1 and Gen2, +feel free to join [our community](https://dvc.org/chat) and share this +knowledge. + +### Q: [What licence DVC is released under](https://discordapp.com/channels/485586884165107732/485596304961962003/542390986299539459) + +Apache 2.0. One of the [most common](https://opensource.org/licenses) and +permissible OSS licences. + +### Q: Setting up S3 compatible remote + +([Localstack](https://discordapp.com/channels/485586884165107732/485596304961962003/543445798868746278), +[wasabi](https://discordapp.com/channels/485586884165107732/485596304961962003/541466951474479115)) + +```dvc +$ dvc remote add upstream s3://my-bucket +$ dvc remote modify upstream region REGION_NAME +$ dvc remote modify upstream endpointurl +``` + +Find and click the `S3 API compatible storage` on +[this page](https://dvc.org/doc/commands-reference/remote/add) + +### Q: [Why DVC creates and updates `.gitignore` file?](https://discordapp.com/channels/485586884165107732/485596304961962003/543914550173368332) + +It adds your data files there, that are tracked by DVC, so that you don’t +accidentally add them to git as well you can open it with file editor of your +liking and see your data files listed there. + +### Q: [Managing data and pipelines with DVC on HDFS](https://discordapp.com/channels/485586884165107732/485596304961962003/545562334983356426) + +With DVC, you could connect your data sources from HDFS with your pipeline in +your local project, by simply specifying it as an external dependency. For +example let’s say your script `process.cmd` works on an input file on HDFS and +then downloads a result to your local workspace, then with DVC it could look +something like: + +```dvc +$ dvc run -d hdfs://example.com/home/shared/input \ + -d process.cmd \ + -o output process.cmd +``` + +[read more](https://discordapp.com/channels/485586884165107732/485596304961962003/545562334983356426). + +
+ +If you have any questions, concerns or ideas, let us know +[here](https://dvc.org/support) and our stellar team will get back to you in no +time. diff --git a/content/blogs/2019-04-18-april-19-dvc-heartbeat.md b/content/blogs/2019-04-18-april-19-dvc-heartbeat.md new file mode 100644 index 0000000000..6e76dee5c6 --- /dev/null +++ b/content/blogs/2019-04-18-april-19-dvc-heartbeat.md @@ -0,0 +1,264 @@ +--- +title: April ’19 DVC❤️Heartbeat +date: 2019-04-18 +description: > + DVC creator Dmitry Petrov is giving a talk on PyCon 2019 🎤, new DVC logo + design, new Discord discussions, interesting reads that caught our eye, and + everything along the way. +descriptionLong: > + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: 2019-04-18/post-image.jpeg +author: svetlana_grinchenko +commentsUrl: https://discuss.dvc.org/t/april-19-dvc-heartbeat/292 +tags: + - Heartbeat + - Community + - PyCon +--- + +## News and links + +We have some exciting news to share this month! + +DVC is going to [PyCon 2019](https://us.pycon.org/2019/)! It is the first +conference that we attend as a team. When we say ‘team’ — we mean it. Our +engineers are flying from all over the globe to get together offline and catch +up with fellow Pythonistas. + +The [speaker pipeline](https://us.pycon.org/2019/schedule/talks/list/) is +amazing! DVC creator Dmitry Petrov is giving a talk on +[Machine learning model and dataset versioning practices](https://us.pycon.org/2019/schedule/presentation/176/). + +Stop by our booth at the Startup Row on Saturday, May 4, reach out and let us +know that you are willing to chat, or simply find a person with a huge DVC owl +on their shirt! 
+ +Speaking of the owls — DVC has done some rebranding recently and we love our new +logo. Special thanks to [99designs.com](https://99designs.com/) for building a +great platform for finding trusted designers. + +![](../uploads/images/2019-04-18/trusted-designers.png) + +DVC is moving fast (almost as fast as my two-year-old). We do our best to keep +up and totally love all the buzz in our community channels lately! + +Here is a number of interesting reads that caught our eye: + +- **[A walkthrough of DVC](https://blog.codecentric.de/en/2019/03/walkthrough-dvc/) + by [Bert Besser](https://www.linkedin.com/in/bert-besser-284564182/)** + + + +A great article about using DVC with a quite advanced scenario and docker. If +you haven’t had a chance to try [DVC.org](http://dvc.org/) yet — this is a great +comprehensive read on why you should do so right away. + +- **[The state of machine learning operations](https://github.com/EthicalML/state-of-mlops-2019) + by [Alejandro Saucedo](https://www.linkedin.com/in/axsaucedo/)** + + + +A short (only 8 minutes!) and inspiring talk by Alejandro Saucedo at FOSDEM. +Alejandro covers the key trends in machine learning operations, as well as most +recent open source tools and frameworks. Focused on reproducibility, monitoring +and explainability, this lightning talk is a great snapshot of the current state +of ML operations. + +- **[Interview with Kaggle Grandmaster, Senior Computer Vision Engineer at Lyft: Dr. Vladimir I. Iglovikov](https://hackernoon.com/interview-with-kaggle-grandmaster-senior-cv-engineer-at-lyft-dr-vladimir-i-iglovikov-9938e1fc7c) + by [Sanyam Bhutani](https://twitter.com/bhutanisanyam1)** + + + +> There is no way you will become Kaggle Master and not learn how to approach +> anew, the unknown problem in a fast hacking way with a very high number of +> iterations per unit of time. This skill in the world of competitive learning +> is the question of survival + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: [What are the system requirements to install DVC (type of operating system, dependencies of another application (as GIT), memory, cpu, etc).](https://discordapp.com/channels/485586884165107732/485596304961962003/552098155861114891) + +- It supports Windows, Mac, Linux. Python 2 and 3. + +- No specific CPU or RAM requirements — it’s a lightweight command line tool and + should be able run pretty much everywhere you can run Python. + +- It depends on a few Python libraries that it installs as dependencies (they + are specified in the + [`setup.py`](https://github.com/iterative/dvc/blob/master/setup.py)). + +- It does not depend on Git and theoretically could be run without any SCM. + Running it on top of a Git repository however is recommended and gives you an + ability to actually save history of datasets, models, etc (even though it does + not put them into Git directly). + +### Q: [Do I have to buy a server license to run DVC, do you have this?](https://discordapp.com/channels/485586884165107732/485596304961962003/560212552638791706) + +No server licenses for DVC. It is 100% free and open source. + +### Q: [What is the storage limit when using DVC?](https://discordapp.com/channels/485586884165107732/485596304961962003/560154903331340289) + +I am trying to version control datasets and models with >10 GB (Potentially even +bigger). Can DVC handle this? + +There is no limit. None enforced by DVC itself. It depends on the size of your +local or [remote storages](https://dvc.org/doc/commands-reference/remote). You +need to have some space available on S3, your SSH server or other storage you +are using to keep these data files, models and their version, which you would +like to store. 
+ +### Q: [How does DVC know the sequence of stages to run](https://discordapp.com/channels/485586884165107732/485596304961962003/553731815228178433)? + +How does it connect them? Does it see that there is a dependency which is +outputted from the first run? + +DVC figures out the pipeline by looking at the dependencies and outputs of the +stages. For example, having the following: + +`gist:SvetaGr/a2a28fbc9db0a675422785bc5f925e14#heartbeat-dvc-run-2019-04.sh` + +you will end up with two stages: `download.dvc` and `duplicate.dvc`. The +download one will have `joke.txt` as an output . The duplicate one defined +`joke.txt` as a dependency, as it is the same file. DVC detects that and creates +a pipeline by joining those stages. + +You can inspect the content of each stage file +[here](https://dvc.org/doc/user-guide/project-structure) (they are human +readable). + +### Q: [Is it possible to use the same data of a remote in two different repositories?](https://discordapp.com/channels/485586884165107732/485596304961962003/560022999848321026) + +(e.g. in one repo `run dvc pull -r my_remote` to pull some data and running the +same command in a different git repo should also pull the same) + +Yes! It’s a frequent scenario for multiple repos to share remotes and even local +cache. DVC file serves as a link to the actual data. If you add the same DVC +file (e.g. `data.dvc`) to the new repo and do `dvc pull -r remotename data.dvc`- +it will fetch data. You have to use `dvc remote add` first to specify the +coordinates of the remote storage you would like to share in every project. +Alternatively (check out the question below), you could use `--global` to +specify a single default remote (and/or cache dir) per machine. + +### Q: [Could I set a global remote server, instead of config in each project?](https://discordapp.com/channels/485586884165107732/485586884165107734/559653121228275727) + +Use `--global` when you specify the remote settings. 
Then remote will be visible +for all projects on the same machine. `--global` — saves remote configuration to +the global config (e.g. `~/.config/dvc/config`) instead of a per project one — +`.dvc/config`. See more details +[here](https://dvc.org/doc/commands-reference/remote/add). + +### Q: [How do I version a large dataset in S3 or any other storage?](https://discordapp.com/channels/485586884165107732/485596304961962003/554679392823934977) + +We would recommend to skim through our +[get started](https://dvc.org/doc/get-started) tutorial, to summarize the data +versioning process of DVC: + +- You create stage (aka DVC) files by adding, importing files (`dvc add` / + `dvc import`) , or run a command to generate files: + +```dvc +$ dvc run --out file.csv "wget https://example.com/file.csv" +``` + +- This stage files are tracked by `git` + +- You use git to retrieve previous stage files (e.g. `git checkout v1.0`) + +- Then use `dvc checkout` to retrieve all the files related by those stage files + +All your files (with each different version) are stored in a `.dvc/cache` +directory, that you sync with a remote file storage (for example, S3) using the +`dvc push` or `dvc pull` commands (analogous to a `git push` / `git pull`, but +instead of syncing your `.git`, you are syncing your `.dvc` directory) on a +remote repository (let’s say an S3 bucket). + +### Q: [How do I move/rename a DVC-file?](https://discordapp.com/channels/485586884165107732/485596304961962003/558216007684980736) + +If you need to move your dvc file somewhere, it is pretty easy, even if done +manually: + +`gist:SvetaGr/b25a5b45773bf94d36e60d48462502f4#heartbeat-dvc-rename.sh` + +### Q: [I performed `dvc push` of a file to a remote. On the remote there is created a directory called `8f` with a file inside called `2ec34faf91ff15ef64abf3fbffa7ee`. The original CSV file doesn’t appear on the remote. 
Is that expected behaviour?](https://discordapp.com/channels/485586884165107732/485596304961962003/555431645402890255) + +This is an expected behavior. DVC saves files under the name created from their +checksum in order to prevent duplication. If you delete “pushed” file in your +project directory and perform `dvc pull`, DVC will take care of pulling the file +and renaming it to “original” name. + +Below are some details about how DVC cache works, just to illustrate the logic. +When you add a data source: + +`gist:SvetaGr/b69fa8ce36bcce00ecd69e7f2d7ccd2e#heartbeat-remote-file-naming.sh` + +It computes the (md5) checksum of the file and generates a DVC file with related +information: + +`gist:SvetaGr/110ae76df929654ec573ea9e4b1e1980#heartbeat-dvc-file-2019-04.yaml` + +The original file is moved to the cache and a link or copy (depending on your +filesystem) is created to replace it on your working space: + +`gist:SvetaGr/133cb93e5a21c6f21a86f8709ed39ea9#heartbeat-cache-structure-2019-04.sh` + +### Q: [Is it possible to integrate dvc with our in-house tools developed in Python?](https://discordapp.com/channels/485586884165107732/485586884165107734/553570391000481802) + +Absolutely! There are three ways you could interact with DVC: + +1. Use [subprocess](https://docs.python.org/3/library/subprocess.html) to launch + DVC + +2. Use `from dvc.main import main` and use it with regular CLI logic like + `ret = main(‘add’, ‘foo’)` + +3. Use our internal API (see `dvc/repo` and `dvc/command` in our source to get a + grasp of it). It is not officially public yet, and we don’t have any special + docs for it, but it is fairly stable and could definitely be used for a POC. + We’ll add docs and all the official stuff for it in the not-so-distant + future. + +### Q: [Can I still track the linkage between data and model without using `dvc run`](https://discordapp.com/channels/485586884165107732/485586884165107734/555750217522216990) and a graph of tasks? 
Basically, I would like an extremely minimal DVC footprint in my Git repo for an existing machine learning application?
+picture: 2019-04-23/post-image.png +author: svetlana_grinchenko +commentsUrl: https://discuss.dvc.org/t/dvc-project-ideas-for-google-season-of-docs-2019/291 +tags: + - Google Season of Docs + - Python + - Documentation + - Company +--- + +We strongly believe that well-shaped documentation is key for making the product +truly open. We have been investing lots of time and energy in improving our docs +lately. Being a team of 90% engineers we are eager to welcome the writers into +our team and our community. We are happy to share our experience, introduce them +to the world of open source and machine learning best practices, guide through +the OS contribution process and work together on improving our documentation. + +DVC was started in late 2017 by a data scientist and an engineer. It is now +growing pretty fast and though our in-house team is quite small, we have to +thank our contributors (more than 80 in both code and docs) for developing DVC +with us. When working with DVC the technical writer will not only get lots of +hands-on experience in writing technical docs, but will also immerse into DVC +community — a warm and welcoming gathering of ML and DS enthusiasts and an +invaluable source of inspiration and expertise in ML engineering. + +### About DVC + +DVC is a brainchild of a data scientist and an engineer, that was created to +fill in the gaps in the ML processes tooling and evolved into a successful open +source project. + +ML brings changes in development and research processes. These ML processes +require new tools for data versioning, ML pipeline versioning, resource +management for model training and others that haven’t been formalized. The +traditional software development tools do not fully cover ML team’s needs but +there are no good alternatives. It makes engineers to custom develop a new +toolset to manage data files, keep track of ML experiments and connect data and +source code together. 
The ML process becomes very fragile and requires tons of +tribal knowledge. + +We have been working on [DVC](http://DVC.org) by adopting best ML practices and +turning them into Git-like command line tool. DVC versions multi-gigabyte +datasets and ML models, make them shareable and reproducible. The tool helps to +organize a more rigorous process around datasets and the data derivatives. Your +favorite cloud storage (S3, GCS, or bare metal SSH server) could be used with +DVC as a data file backend. + +If you are interested in learning a little bit more about DVC and its journey, +here is a great interview with DVC creator in the Episode 206 of +Podcast.**init**. Listen to it +[HERE ](https://www.pythonpodcast.com/data-version-control-episode-206/)or read +the transcript +[HERE.](https://towardsdatascience.com/data-version-control-with-dvc-what-do-the-authors-have-to-say-3c3b10f27ee) + +### The state of DVC documentation + +DVC is a pretty young project, developed and maintained solely by engineers. As +many OS projects we started from the bottom and for a long time our +[documentation](https://dvc.org/doc) was a bunch of bits and pieces. Nowadays +improving documentation is one of our top priorities. We moved to the new +in-house built documentation engine and started working with several technical +writers. Certain parts have been tremendously improved recently, e.g. +[Get Started](https://dvc.org/doc/get-started) and +[certain parts of Commands Reference](https://dvc.org/doc/commands-reference/fetch) +. So far most of our documentation has been written majorly by the engineering +team and there is need for improving the overall structure and making some parts +more friendly from a new user perspective. We have mostly complete +[reference documentation](https://dvc.org/doc/commands-reference) for each +command, although some functions are missing good actionable examples. 
We also +have a [User Guide](https://dvc.org/doc/user-guide), however it is not in very +good shape. We strive for making our documentation clear and comprehensive for +users of various backgrounds and proficiency levels and this is where we do need +some fresh perspective. + +### How DVC documentation is built + +We have an open Github Apache-2 licensed repository for the +[DVC website](https://github.com/iterative/dvc.org), the documentation engine +and the [documentation files](https://github.com/iterative/dvc.org). The website +is built with Node.js + React, including the documentation engine (built +in-house). + +Each documentation page is a static Markdown file in the repository, e.g. +[example here](https://github.com/iterative/dvc.org/blob/main/content/docs/command-reference/index.md). +It is rendered dynamically in the browser, no preprocessing is required. It +means that tech writers or contributors need to write/edit a Markdown file, +create a pull request and merge it into the master branch of the +[repository.](https://github.com/iterative/dvc.org) The complete +[documentation contributing guide](https://github.com/iterative/dvc.org/blob/main/README.md#contributing) +describes the directory structure and locations for the different documentation +parts. + +### DVC’s approach to documentation work + +Documentation tasks and issues are maintained on our doc’s GitHub +[issue tracker](https://github.com/iterative/dvc.org/issues). Changes to the +documentation are made via pull requests on GitHub, and go through our standard +review process which is the same for documentation and code. A technical writer +would be trained in working with our current development process. It generally +means that tech writers or contributors need to write/edit a Markdown file, use +git and Github to create a pull request and publish it. 
The documentation +[contributing guide](https://github.com/iterative/dvc.org/blob/main/README.md#contributing) +includes style conventions and other details. Documentation is considered of the +same importance as code. Engineering team has a policy to write or update the +relevant sections if something new is released. If it’s something too involved +engineers may create a ticket and ask for help. There is one maintainer who is +responsible for doing final reviews and merging the changes. In this sense, our +documentation is very similar to any other open source project. + +## Project ideas for GSoD’19 + +We identified a number of ideas to work on and there are two major topics these +ideas fall into. Both topics are pretty broad and we don’t expect we can +completely cover them during this GSoD but hopefully we can make certain +progress. + +First of all, we want to bring more structure and logic to our documentation to +improve user onboarding experience. The goal is for a new user to have a clear +path they can follow and understand what takeaways each part of the +documentation provides. In particular, improving how +[Get Started](https://dvc.org/doc/get-started), +[Tutorials](https://dvc.org/doc/tutorial) and +[Examples](https://dvc.org/doc/tutorials/versioning) relate to each other, +restructuring the existing [User Guide](https://dvc.org/doc/user-guide) to +explain basic concepts, and writing more use cases that resonate with ML +engineers and data scientists. + +The other issue we would like to tackle is improving and expanding the existing +reference docs — commands descriptions, examples, etc. It involves filling in +the gaps and developing new sections, similar to +[this one](https://dvc.org/doc/commands-reference/fetch). We would also love to +see more illustrative materials. 
+ +### Project 1: Improving and expanding User Guide + +**Description and details:** Reviewing, restructuring and filling major gaps in +the User Guide (introductory parts of the basic concepts of DVC), e.g. have a +look at [this ticket](https://github.com/iterative/dvc.org/issues/144) or +[this one](https://github.com/iterative/dvc.org/issues/53). + +**Mentors**: [@shcheklein](https://github.com/shcheklein) and +[@dmpetrov](https://github.com/dmpetrov) + +### Project 2: Expanding and developing new tutorials and use cases. + +**Description and details:** We already have some requests for more tutorials, +e.g. [this ticket](https://github.com/iterative/dvc.org/issues/96). Here is +another good [use case request](https://github.com/iterative/dvc.org/issues/194) +. If you are going to work on this project you would need some domain knowledge, +preferably some basic ML or data science experience. + +**Mentors**: [@shcheklein](https://github.com/shcheklein) and +[@dmpetrov](https://github.com/dmpetrov) + +### Project 3: Improving new user onboarding + +**Description and details:** Analyze and restructure user walkthrough across +[Get started](https://dvc.org/doc/get-started), +[Tutorials](https://dvc.org/doc/tutorial) and +[Examples](https://dvc.org/doc/tutorials/versioning). These three have one thing +in common — hands-on experience with DVC. If you choose this project, we will +work together to come up with a better location for the Examples (to move them +out of the Get Started shadow), and a better location for the Tutorials (to +reference external tutorials that were developed by our community members and +published on different platforms). + +**Mentors**: [@shcheklein](https://github.com/shcheklein) and +[@dmpetrov](https://github.com/dmpetrov) + +### Project 4: Improving commands reference + +**Description and details:** We will work on improving our +[Commands reference](https://dvc.org/doc/commands-reference) section. 
This +includes expanding and filling in the gaps. One of the biggest pain points right +now are Examples. Users want them to be +[easy to run and try](https://github.com/iterative/dvc.org/issues/198) and here +is a lot to be done in terms of improvement. We have a good example of how is +should be done [here](https://dvc.org/doc/commands-reference/fetch). + +**Mentors**: [@shcheklein](https://github.com/shcheklein) and +[@dmpetrov](https://github.com/dmpetrov) + +### Project 5: Describe and integrate “DVC packages” + +**Description and details:** Describe the brand new feature “DVC packages” and +integrate it with the rest of the documentation. We have been working hard to +release a few new commands to help with datasets management (have a look at +[this ticket](https://github.com/iterative/dvc/issues/1487)). It’s a major +feature that deserves its place in the Get Started, Use cases, Commands +Reference, etc. + +**Mentors**: [@shcheklein](https://github.com/shcheklein) and +[@dmpetrov](https://github.com/dmpetrov) + +The ideas we outline above are just an example of what we can work on. We are +open for any other suggestions and would like to work together with the +technical writer to make the contribution experience both useful and enjoyable +for all parties involved. If you have any suggestions or questions we would love +to hear from you => DVC.org/support and our DMs on +[Twitter](https://twitter.com/DVCorg) are always open! + +
+ +Special thanks to the [NumFOCUS](https://numfocus.org/) for the ideas list +inspiration. + +If you are a tech writer — check the +[Technical writer guide](https://developers.google.com/season-of-docs/docs/tech-writer-guide). +From April 30, 2019 you can see the list of participating open source +organizations on the [Season of Docs website](https://g.co/seasonofdocs). The +application period for technical writers opens on **May 29, 2019** and ends on +June 28, 2019. diff --git a/content/blogs/2019-05-21-may-19-dvc-heartbeat.md b/content/blogs/2019-05-21-may-19-dvc-heartbeat.md new file mode 100644 index 0000000000..db4be21e11 --- /dev/null +++ b/content/blogs/2019-05-21-may-19-dvc-heartbeat.md @@ -0,0 +1,301 @@ +--- +title: May ’19 DVC❤️Heartbeat +date: 2019-05-21 +description: > + DVC accepted into Google Season of Docs 🎉, Dmitry's talk at the O’Reilly AI + Conference, new portion of Discord gems, and articles either created or + brought to us by our community. +descriptionLong: > + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: 2019-05-21/post-image.jpeg +pictureComment: | + Kudos to [StickerMule.com](https://www.stickermule.com) for our amazing + stickers (and great customer service)! +author: svetlana_grinchenko +commentsUrl: https://discuss.dvc.org/t/may-19-dvc-heartbeat/290 +tags: + - Heartbeat + - Community + - Google Season of Docs +--- + +## News and links + +This section of DVC Heartbeat is growing with every new Issue and this is +already quite a good piece of news! + +One of the most exciting things we want to share this month is acceptance of DVC +into the [Google Season of Docs](https://developers.google.com/season-of-docs/). 
+It is a new and unique program sponsored by Google that pairs technical writers +with open source projects to collaborate and improve the open source project +documentation. You can find the outline of DVC vision and project ideas in +[this dedicated blogpost](https://blog.dataversioncontrol.com/dvc-project-ideas-for-google-summer-of-docs-2019-defe3a73b248) +and check the +[full list of participating open source organizations](https://developers.google.com/season-of-docs/docs/participants/). +Technically the +[program is starting in a few months](https://developers.google.com/season-of-docs/docs/timeline), +but there is already a fantastic increase in the amount of commits and +contributors, and we absolutely love it! + +The other important milestone for us was the first offline meeting with our +distributed remote team. Working side by side and having non-Zoom meetings with +the team was amazing. Joining our forces to prepare for the upcoming conferences +turned out to be the most valuable, educating and uniting experience for the +whole team. + +It’s a shame that our tech lead was unable to join us it due to another visa +denial. We do hope he will finally make it to the USA for the next big +conference. + +![](../uploads/images/2019-05-21/the-world-is-changing.png) + +While we were busy finalizing all the PyCon 2019 prep, our own +[Dmitry Petrov](https://twitter.com/FullStackML) flew to New York to speak at +the +[O’Reilly AI Conference](https://conferences.oreilly.com/artificial-intelligence/ai-ny) +about the +[Open Source tools for Machine Learning Models and Datasets versioning](https://www.oreilly.com/library/view/artificial-intelligence-conference/9781492050544/video324691.html). +Unfortunately the video is available for the registered users only (with a free +trial option) but you can have a look at Dmitry’s slides +[here](https://www.slideshare.net/DmitryPetrov15/dvc-oreilly-artificial-intelligence-conference-2019-new-york). 
+ +![](../uploads/images/2019-05-21/iterative-ai-twitter.png) + +We renamed our Twitter! Our old handle was a bit misleading and we moved from +@Iterativeai to [@DVCorg](https://twitter.com/DVCorg) (yet keep the old one for +future projects). + +Our team is so happy every time we discover an article featuring DVC or +addressing one of the burning ML issues we are trying to solve. Here are some of +our favorite links from the past month: + +- **[Version Control For Your Machine Learning Projects — Episode 206](https://www.pythonpodcast.com/data-version-control-episode-206/)** + by **[Tobias Macey](https://www.linkedin.com/in/tmacey/)** + + + +> Version control has become table stakes for any software team, but for machine +> learning projects there has been no good answer for tracking all of the data +> that goes into building and training models, and the output of the models +> themselves. To address that need Dmitry Petrov built the Data Version Control +> project known as DVC. In this episode he explains how it simplifies +> communication between data scientists, reduces duplicated effort, and +> simplifies concerns around reproducing and rebuilding models at different +> stages of the projects lifecycle. + +- **Here is an + [article](https://towardsdatascience.com/data-version-control-with-dvc-what-do-the-authors-have-to-say-3c3b10f27ee) + by [Favio Vázquez](https://medium.com/@faviovazquez) with a transcript of this + podcast episode.** + + + +- **[Why Git and Git-LFS is not enough to solve the Machine Learning Reproducibility crisis](https://towardsdatascience.com/why-git-and-git-lfs-is-not-enough-to-solve-the-machine-learning-reproducibility-crisis-f733b49e96e8)** + + + +> With Git-LFS your team has better control over the data, because it is now +> version controlled. Does that mean the problem is solved? Earlier we said the +> “_key issue is the training data_”, but that was a lie. Sort of. Yes keeping +> the data under version control is a big improvement. 
But is the lack of +> version control of the data files the entire problem? No. + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: This might be [a favourite gem of ours ](https://discordapp.com/channels/485586884165107732/485598848111083531/572960640122224640) — our engineers are so fast that someone assumed they were bots. + +We feared that too until we met them in person. They appeared to be real (unless +bots also love Ramen now)! + +![](../uploads/images/2019-05-21/bots-also-love-ramen-now.png) + +### Q: [Is this the best way to track data with DVC when code and data are separate?](https://discordapp.com/channels/485586884165107732/485596304961962003/572974117351849997) Having being burned by this a couple of times, i.e accidentally pushing large files to GitHub, I now keep my code and data separate. + +Every time you run `dvc add` to start tracking some data artifact, its path is +automatically added to the `.gitignore` file, as a result it is hard to commit +it to git by mistake — you would need to explicitly modify the `.gitignore` +first. The feature to track some external data is called +[external outputs](https://dvc.org/doc/user-guide/managing-external-data) (if +all you need is to track some data artifacts). Usually it is used when you have +some data on S3 or SSH and don’t want to pull it into your working space, but +it’s working even when your data is located on the same machine outside of the +repository. + +### Q: [How do I wrap a step that downloads a file/directory into a DVC stage?](https://discordapp.com/channels/485586884165107732/485596304961962003/571342592508428289) I want to ensure that it runs only if file has no been downloaded yet + +Use `dvc import` to track and download the remote data first time and next time +when you do dvc repro if data has changed remotely. 
If you don’t want to track +remote changes (lock the data after it was downloaded), use `dvc run` with a +dummy dependency (any text file will do you do not touch) that runs an actual +wget/curl to get the data. + +### Q: [How do I show a pipeline that does not have a default Dvcfile?](https://discordapp.com/channels/485586884165107732/485596304961962003/570943786151313408) (e.g. I assigned all files names manually with `-f` in the `dvc run` command and I just don’t have `Dvcfile` anymore) + +Almost any command in DVC that deals with pipelines (set of DVC-files) accepts a +single stage as a target, for example: + +```dvc +$ dvc pipeline show — ascii model.dvc +``` + +### Q: [DVC hangs or I’m getting `database is locked` issue](https://discordapp.com/channels/485586884165107732/485596304961962003/570843482218823682) + +It’s a well known problem with NFS, CIFS (Azure) — they do not support file +locks properly which is required by the SQLLite engine to operate. The easiest +workaround — don’t create a DVC project on network attached partition. In +certain cases a fix can be made by changing mounting options, check +[this discussion](https://discordapp.com/channels/485586884165107732/485596304961962003/570276668694855690) +for the Azure ML Service. + +### Q: [How do I use DVC if I use a separate drive to store the data and a small/fast SSD to run computations?](https://discordapp.com/channels/485586884165107732/485596304961962003/570091809594671126) I don’t have enough space to bring data to my working space. + +An excellent question! The short answer is: + +```dvc +# To move your data cache to a big partition +$ dvc cache dir --local /path/to/an/external/partition + +# To enable symlinks/harldinks to avoid actual copying +$ dvc config cache.type reflink, hardlink, symlink, copy + +# To protect the cache +$ dvc config cache.protected true +``` + +The last one is highly recommended to make links in your working space read-only +to avoid corrupting the cache. 
Read more about different link types +[here](https://dvc.org/doc/user-guide/large-dataset-optimization). + +To add your data first time to the DVC cache, do a clone of the repository on a +big partition and run `dvc add` to add your data. Then you can do `git pull`, +`dvc pull` on a small partition and DVC will create all the necessary links. + +### Q: [Why I’m getting `Paths for outs overlap` error when I run `dvc add` or `dvc run`?](https://discordapp.com/channels/485586884165107732/485596304961962003/571335064374345749) + +Usually it means that a parent directory of one of the arguments for `dvc add` / +`dvc run` is already tracked. For example, you’ve added the whole datasets +directory already. And now you are trying to add a subdirectory, which is +already tracked as a part of the datasets one. No need to do that. You could +`dvc add datasets` or `dvc repro datasets.dvc` to save changes. + +### Q: [I’m getting `ascii codec can’t encode character` error on DVC commands when I deal with unicode file names](https://discordapp.com/channels/485586884165107732/485596304961962003/567310354766495747) + +[Check the locale settings you have](https://perlgeek.de/en/article/set-up-a-clean-utf8-environment) +(`locale` command in Linux). Python expects a locale that can handle unicode +printing. Usually it’s solved with these commands: `export LC_ALL=en_US.UTF-8` +and `export LANG=en_US.UTF-8`. You can place those exports into `.bashrc` or +other file that defines your environment. + +### Q: [Does DVC use the same logins `aws-cli` has when using an S3 bucket as its repo/remote storage](https://discordapp.com/channels/485586884165107732/485596304961962003/563149775340568576)? + +In short — yes, but it can be also configured. DVC is going to use either your +default profile (from `~/.aws/*`) or your env vars by default. If you need more +flexibility (e.g. 
you need to use different credentials for different projects, +etc) check out +[this guide](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) +to configure custom aws profiles and then you could use them with DVC using +these +[remote options](https://dvc.org/doc/commands-reference/remote/add#options). + +### Q: [How can I output multiple metrics from a single file?](https://discordapp.com/channels/485586884165107732/485596304961962003/566000729505136661) + +Let’s say I have the following in a file: + +```json +{ + “AUC_RATIO”: + { + “train”: 0.8922748258797667, + “valid”: 0.8561602726251776, + “xval”: 0.8843431199314923 + } +} +``` + +How can I show both `train` and `valid` without `xval`? + +You can use `dvc metrics show` command `--xpath` option and provide multiple +attribute names to it: + +```dvc +$ dvc metrics show metrics.json \ + --type json \ + --xpath AUC_RATIO[train,valid] + metrics.json: + 0.89227482588 + 0.856160272625 +``` + +### Q: [What is the quickest way to add a new dependency to a DVC-file?](https://discordapp.com/channels/485586884165107732/485596304961962003/566314479499870211) + +There are a few options to add a new dependency: + +- simply opening a file with your favorite editor and adding a dependency there + without md5. DVC will understand that that stage is changed and will re-run + and re-calculate md5 checksums during the next DVC repro; + +- use `dvc run --no-exec` is another option. It will rewrite the existing file + for you with new parameters. + +### Q: [Is there a way to add a dependency to a python package, so it runs a stage again if it imported the updated library?](https://discordapp.com/channels/485586884165107732/485596304961962003/566315265646788628) + +The only recommended way so far would be to somehow make DVC know about your +package’s version. 
One way to do that would be to create a separate stage that +would be dynamically printing version of that specific package into a file, that +your stage would depend on: + +```dvc +$ dvc run -o mypkgver 'pip show mypkg > mypkgver’ +$ dvc run -d mypkgver -d ... -o .. mycmd +``` + +### Q: [Is there anyway to forcibly recompute the hashes of dependencies in a pipeline DVC-file?](https://discordapp.com/channels/485586884165107732/485596304961962003/564807276146458624) + +E.g. I made some whitespace/comment changes in my code and I want to tell DVC +“it’s ok, you don’t have to recompute everything”. + +Yes, you could `dvc commit -f`. It will save all current checksum without +re-running your commands. + +### Q: [I have projects that use data that’s stored in S3. I never have data locally to use `dvc push`, but I would like to have this data version controlled.](https://discordapp.com/channels/485586884165107732/485596304961962003/563352000281182218) Is there a way to use the features of DVC in this use case? + +Yes! This DVC features is called +[external outputs](https://dvc.org/doc/user-guide/large-dataset-optimization) +and +[external dependencies](https://dvc.org/doc/user-guide/external-dependencies). +You can use one of them or both to track, process, and version your data on a +cloud storage without downloading it locally. + +
+ +If you have any questions, concerns or ideas, let us know +[here](https://dvc.org/support) and our stellar team will get back to you in no +time! diff --git a/content/blogs/2019-06-26-june-19-dvc-heartbeat.md b/content/blogs/2019-06-26-june-19-dvc-heartbeat.md new file mode 100644 index 0000000000..408becc0a0 --- /dev/null +++ b/content/blogs/2019-06-26-june-19-dvc-heartbeat.md @@ -0,0 +1,233 @@ +--- +title: June ’19 DVC❤️Heartbeat +date: 2019-06-26 +description: > + First DVC user survey, sharing our PyCon experience, new portion of Discord + discussions, and articles either created or brought to us by our community. +descriptionLong: > + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: 2019-06-26/post-image.png +pictureComment: | + Thanks to the amazing [Signaturit Tech](https://twitter.com/SignaturitTech) + team for this + [photo](https://twitter.com/SignaturitTech/status/1127927520140120065?s=20)! +author: svetlana_grinchenko +commentsUrl: https://discuss.dvc.org/t/june-19-dvc-heartbeat/289 +tags: + - PyCon + - Heartbeat + - Community +--- + +## News and links + +We want to start by saying to our users, contributors, and community members how +grateful we are for the fantastic work you are doing contributing to DVC, giving +talks about DVC, sharing your feedback, use cases and your concerns. A huge +thank you to each of you from the DVC team! + +We would love to give back and support any positive initiative around DVC — just +let us know [here](https://dvc.org/support) and we will send you a bunch of cool +swag, connect to a tech expert or find another way to support your project. Our +[DMs on Twitter](https://twitter.com/DVCorg) are open, too. 
+ +**And if you have 4 minutes to spare, we are conducting out first +[DVC user survey](https://docs.google.com/forms/d/1tmn8YHLUkeSi5AIq4DGJi28iZy9HTazl6DWKe3Hxpnc/edit?ts=5cfc47c2) +and would love to hear from you!** + +Aside from admiring great DVC-related content from our users we have one more +reason to particularly enjoy the past month — DVC team went to Cleveland to +attend [PyCon 2019](https://us.pycon.org/2019/about/) and it was a blast! + +![](../uploads/images/2019-06-26/cleveland-to-attend-pycon-2019.jpeg) _Amazing +[Jennifer](https://github.com/sureL) and her artwork for our +[SupportOpenSource](https://twitter.com/hashtag/SupportOpenSource) contest_ + +We had it all. Running our first ever conference booth, leading an impromptu +unconference discussion and arranging some cool +[#SupportOpenSource](https://twitter.com/hashtag/SupportOpenSource?src=hashtag_click) +activities was great! Last-minute accommodation cancellations, booth equipment +delivery issues, and being late for our very own talk was not so great. Will be +sharing more about it in a separate blogpost soon. + +https://youtu.be/jkfh2PM5Sz8 + +Here is [Dmitry Petrov](https://twitter.com/FullStackML)’s PyCon +[talk](https://www.youtube.com/watch?v=jkfh2PM5Sz8) and +[slides](https://docs.google.com/presentation/d/1CYt0w8WoZAXiQEtVDVDsTnQumzdZx91v32MwEK20R-E/edit) +on Machine learning model and dataset versioning practices. + +We absolutely loved being at PyCon and can’t wait for our next conference! + +
+ +Our team is so happy every time we discover an article featuring DVC or +addressing one of the burning ML issues we are trying to solve. Here are some of +the links that caught our eye past month: + +- **[The Rise of DataOps (from the ashes of Data Governance)](https://towardsdatascience.com/the-rise-of-dataops-from-the-ashes-of-data-governance-da3e0c3ac2c4) + by [Ryan Gross](https://towardsdatascience.com/@ryanwgross).** + +A brilliant comprehensive read on the current data management issues. It might +be the best article we have ever read on this subject. Every word strongly +resonates with our vision and ideas behind DVC. Highly recommended by DVC team! + + + +> Legacy Data Governance is broken in the ML era. Let’s rebuild it as an +> engineering discipline. At the end of the transformation, data governance will +> look a lot more like DevOps, with data stewards, scientists, and engineers +> working closely together to codify the governance policies. + +- **[First Impressions of Data Science Version Control (DVC)](https://medium.com/@christopher.samiullah/first-impressions-of-data-science-version-control-dvc-fe96ab29cdda) + by [Christopher Samiullah](https://christophergs.github.io/)** + + + +> In 2019, we tend to find organizations using a mix of git, Makefiles, ad hoc +> scripts and reference files to try and achieve reproducibility. DVC enters +> this mix offering a cleaner solution, specifically targeting Data Science +> challenges. 
+ +- **[Versioning and Reproducibility with MLV-tools and DVC](https://github.com/peopledoc/mlvtools-tutorial): + [Talk](https://peopledoc.github.io/mlvtools-tutorial/talks/pyData/presentation.html#/) + and + [Tutorial](https://peopledoc.github.io/mlvtools-tutorial/talks/workshop/presentation.html#/) + by [Stéphanie Bracaloni](https://github.com/sbracaloni) and + [Sarah Diot-Girard](https://github.com/SdgJlbl).** + +![](../uploads/images/2019-06-26/versioning-and-reproducibility-with-mlv-tools.png) + +- **[Becoming a machine learning company means investing in foundational technologies](https://www.oreilly.com/ideas/becoming-a-machine-learning-company-means-investing-in-foundational-technologies) + by [Ben Lorica](https://www.oreilly.com/people/4e7ad-ben-lorica)** + + + +> With an eye toward the growing importance of machine learning, we recently +> completed +> [a data infrastructure survey](https://www.oreilly.com/data/free/evolving-data-infrastructure.csp) +> that drew more than 3,200 respondents. + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: [Does DVC support Azure Data Lake Gen1?](https://discordapp.com/channels/485586884165107732/563406153334128681/575655655629651968) + +Azure data lake is HDFS compatible. And DVC supports HDFS remotes. Give it a try +and let us know if you hit any problems [here](https://dvc.org/chat). + +### Q: [An excellent discussion on versioning tabular (SQL) data.](https://discordapp.com/channels/485586884165107732/563406153334128681/575681811401801748) Do you know of any tools that deal better with SQL-specific versioning? + +It’s a wide topic. The actual solution might depend on a specific scenario and +what exactly needs to be versioned. DVC does not provide any special +functionality on top of databases to version their content. + +Depending on your use case, our recommendation would be to run SQL and pull the +result file (CSV/TSV file?) that then can be used to do analysis. This file can +be taken under DVC control. Alternatively, in certain cases source files (that +are used to populate the databases) can be taken under control and we can keep +versions of them, or track incoming updates. + +Read the +[discussion](https://discordapp.com/channels/485586884165107732/563406153334128681/575681811401801748) +to learn more. + +### Q: [How does DVC do the versioning between binary files?](https://discordapp.com/channels/485586884165107732/563406153334128681/575686711821205504) Is there a binary diff, similar to git? Or is every version stored distinctly in full? + +DVC is just saving every file as is, we don’t use binary diffs right now. 
There +won’t be a full directory (if you added just a few files to a 10M files +directory) duplication, though, since we treat every file inside as a separate +entity. + +### Q: [Is there a way to pass parameters from e.g. `dvc repro` to stages?](https://discordapp.com/channels/485586884165107732/563406153334128681/576160840701575169) + +The simplest option is to create a config file — json or whatnot — that your +scripts would read and your stages depend on. + +### Q: [What is the best way to get cached output files from different branches simultaneously?](https://discordapp.com/channels/485586884165107732/563406153334128681/577852740034625576) For example, cached tensorboard files from different branches to compare experiments. + +There is a way to do that through our (still not officially released) API pretty +easily. Here is an +[example script](https://cdn.discordapp.com/attachments/563406153334128681/577894682722304030/dvc_get_output_files.py) +how it could be done. + +### Q: [Docker and DVC.](https://discordapp.com/channels/485586884165107732/563406153334128681/583949033685516299) To being able to push/pull data we need to run a git clone to get DVC-files and remote definitions — but we worry that would make the container quite heavy (since it contains our entire project history). + +You can do `git clone — depth 1`, which will not download any history except the +latest commits. + +### Q: [After DVC pushing the same file, it creates multiple copies of the same file. Is that how it’s supposed to work?](https://discordapp.com/channels/485586884165107732/485596304961962003/574133734136086559) + +If you are pushing the same file, there are no copies pushed or saved in the +cache. DVC is using checksums to identify files, so if you add the same file +once again, it will detect that cache for it is already in the local cache and +wont copy it again to cache. 
Same with dvc push, if it sees that you already +have cache file with that checksum on your remote, it won’t upload it again. + +### Q: [How do I uninstall DVC on Mac (installed via `pkg` installer)?](https://discordapp.com/channels/485586884165107732/485596304961962003/574941227624169492) + +Something like this should work: + +```dvc +$ which dvc +/usr/local/bin/dvc -> /usr/local/lib/dvc/dvc + +$ ls -la /usr/local/bin/dvc +/usr/local/bin/dvc -> /usr/local/lib/dvc/dvc + +$ sudo rm -f /usr/local/bin/dvc +$ sudo rm -rf /usr/local/lib/dvc +$ sudo pkgutil --forget com.iterative.dvc +``` + +### Q: [How do I pull from a public S3 bucket (that contains DVC remote)?](https://discordapp.com/channels/485586884165107732/485596304961962003/575236576309674024) + +Just add public URL of the bucket as an HTTP endpoint. See +[here](https://github.com/iterative/example-get-started/blob/master/.dvc/config) +for an example. +[https://remote.dvc.org/get-started](https://remote.dvc.org/get-started) is made +to redirect to the S3 bucket anyone can read from. + +### Q: [I’m getting the same error over and over about locking:](https://discordapp.com/channels/485586884165107732/485596304961962003/575535709490905101) `ERROR: failed to lock before running a command — cannot perform the cmd since DVC is busy and locked. Please retry the command later.` + +Most likely it happens due to an attempt to run DVC on NFS that has some +configuration problems. There is a +[well known problem with DVC on NFS](https://github.com/iterative/dvc/issues/1918) +— sometimes it hangs on trying to lock a file. The usual workaround for this +problem is to allocate DVC cache on NFS, but run the project (git clone, DVC +metafiles, etc) on the local file system. Read +[this answer](https://discuss.dvc.org/t/share-nas-data-in-server/180/4?u=shcheklein) +to see how it can be setup. + +
+ +If you have any questions, concerns or ideas, let us know in the comments below +or connect with DVC team [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are open, too. diff --git a/content/blogs/2019-08-01-july-19-dvc-heartbeat.md b/content/blogs/2019-08-01-july-19-dvc-heartbeat.md new file mode 100644 index 0000000000..246053a312 --- /dev/null +++ b/content/blogs/2019-08-01-july-19-dvc-heartbeat.md @@ -0,0 +1,212 @@ +--- +title: July ’19 DVC❤️Heartbeat +date: 2019-08-01 +description: > + As we continue to grow DVC together with our fantastic contributors, we enjoy + more and more insights, discussions, and articles either created or brought to + us by our community. +descriptionLong: > + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: 2019-08-01/post-image.png +pictureComment: | + Special edition + [DVC shirt](https://twitter.com/rkuprieiev/status/1144298339200098306?s=20). + We made this one for [Ruslan](https://github.com/efiop) — DVC maintainer and + the best tech lead. +author: svetlana_grinchenko +commentsUrl: https://discuss.dvc.org/t/july-19-dvc-heartbeat/288 +tags: + - Heartbeat + - Open Source Summit + - Community +--- + +## News and links + +As we continue to grow DVC together with our fantastic contributors, we enjoy +more and more insights, discussions, and articles either created or brought to +us by our community. We feel it is the right time to start sharing more of your +news, your stories and your discoveries. New Heartbeat is here! + +Speaking of our own news — next month DVC team is going to the +[Open Source North America Summit](https://events.linuxfoundation.org/events/open-source-summit-north-america-2019/). 
+It is taking place in San Diego on August 21–23. +[Dmitry](https://ossna19.sched.com/speaker/dmitry35) and +[Sveta](https://ossna19.sched.com/speaker/svetlanagrinchenko) will be giving +talks and we will run a booth. So looking forward to it! Stop by for a chat and +some cool swag. And if you are in San Diego on those days and want to catch up — +please let us know [here](http://dvc.org/support) or on Twitter! + + + + + +Every month our team is excited to discover new great pieces of content +addressing some of the burning ML issues. Here are some of the links that caught +our eye in June: + +- **[Principled Machine Learning: Practices and Tools for Efficient Collaboration](https://dev.to/robogeek/principled-machine-learning-4eho) + by [David Herron](https://medium.com/@7genblogger)** + + + +> As we’ve seen in this article some tools and practices can be borrowed from +> regular software engineering. However, the needs of machine learning projects +> dictate tools that better fit the purpose. + +- **First + [ML-REPA](http://ml-repa.ru/) [Meetup: Reproducible ML experiments](http://ml-repa.ru/page6697700.html) + hosted by [Raiffeisen DGTL](https://dgtl.raiffeisen.ru/) — check out the video + and slide decks.** + + + +[ML-REPA](http://ml-repa.ru/) is a new fantastic resource for +Russian-speaking folks interested in Reproducibility, Experiments and Pipelines +Automation. Curated by [Mikhail Rozhkov](https://twitter.com/mnrozhkov) and +highly recommended by our team. + +### [How do you manage your machine learning experiments?](https://www.reddit.com/r/MachineLearning/comments/bx0apm/d_how_do_you_manage_your_machine_learning/) discussion on Reddit is full of insights. +
[D] How do you manage your machine learning experiments? from r/MachineLearning
+ +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: I have within one git repository different folders with very different content (basically different projects, or content I want to have different permissions to), and I thought about using different buckets in AWS as remotes. [I’m not sure if it’s possible with DVC to store some files in some remote, and some other files in some other remote, is it?](https://discordapp.com/channels/485586884165107732/485596304961962003/575718048330416158) + +You can definitely add more than one remote (see +[dvc remote add](https://dvc.org/doc/commands-reference/remote/add)) and then +[dvc push](https://dvc.org/doc/commands-reference/push) has a `-R` option to +pick which one to send the cached data files (deps, outs, etc) to. We would not +recommend doing this though. It complicates the commands you have to run — you +will need to remember to specify a remote name for every command that deals with +data — `push`, `pull`, `gc`, `fetch`, `status`, etc. Please, leave a comment in +the relevant issue [here](https://github.com/iterative/dvc/issues/2095) if this +case is important for you. + +### Q: [Is that possible with DVC to have multiple (few) metric files and compare them all at once?](https://discordapp.com/channels/485586884165107732/485596304961962003/578532350221352987) For example, we’d like to consider as metrics the loss of a neural network training process (loss as a `-M` output of a training stage), and also apart knowing the accuracy of the NN on a test set (another `-M` output of eval stage). + +Yes, it is totally fine to use `-M` in different stages. `dvc metrics show` will +just show both metrics. 
+ +### Q: [I have a scenario where an artifacts (data) folder is created by the dvc run command via the `-o` flag. I have manually added another file into or modified the artifacts folder but when I do `dvc push` nothing happens, is there anyway around this?](https://discordapp.com/channels/485586884165107732/485596304961962003/577362750443880449) + +Let’s first do a quick recap on how DVC handles data files (you can definitely +find more information on the [DVC documentation site](http://dvc.org/docs)). + +- When you do `dvc add`, `dvc run` or `dvc import` DVC puts artifacts (in case + of `dvc run` artifacts == outputs produced by the command) into `.dvc/cache` + directory (default cache location). You don’t see this happening because + [DVC keeps links](https://dvc.org/doc/user-guide/large-dataset-optimization) + (or in certain cases creates a copy) to these files/directories. + +- `dvc push` does not move files from the workspace (that is what you see) to the + remote storage, it always moves files/directories that are already in cache + (default is .dvc/cache). + +- So, now you’ve added a file manually, or made some other modifications. But + these files are not in cache yet. The analogy would be `git commit`. You + change the file, you do `git commit`, only after that you can push something + to a Git server (Github/Gitlab, etc). The difference is that DVC is doing commit + (moves files to cache) automatically in certain cases — `dvc add`, `dvc run`, + etc. + +There is an explicit command — `dvc commit` - that you should run if you want to +enforce the change to the output produced by `dvc run`. This command will update +the corresponding DVC-files (.dvc extension) and will move data to cache. After +that you should be able to run `dvc push` to save your data on the external +storage. + +Note, when you do an explicit commit like this you are potentially “breaking”
In a sense that there is no guarantee now that your +directory can be produced by `dvc run`/`dvc repro` — since you changed it +manually. + +### Q: [I’d like to transform my dataset in-place to avoid copying it, but I can’t use `dvc run` to do this because it doesn’t allow the same directory as an output and a dependency.](https://discordapp.com/channels/485586884165107732/485596304961962003/578898899469729796) + +You could do this in one step (one stage). So that getting your data and +modifying it, is one stage. So you don’t depend on the data folder. You just +could depend on your download + modifying script. + +### Q: [Can anyone tell me what this error message is about?](https://discordapp.com/channels/485586884165107732/485596304961962003/579283950778712076) “To avoid unpredictable behavior, rerun command with non overlapping outs paths.” + +Most likely it means that there is a DVC-file that has the same output twice. +Or there are two DVC-files that share the same output file. + +### Q: [I’m getting “No such file or directory” error when I do `dvc run` or `dvc repro`](https://discordapp.com/channels/485586884165107732/485596304961962003/580176327701823498). The command runs fine if I don’t use DVC. + +That happens because dvc run is trying to ensure that your command is the one +creating your output and removes existing outputs before executing the command. +So that when you run `dvc repro` later, it will be able to fully reproduce the +output. So you need to make the script create the directory or file. + +### Q: [I’m implementing a CI/CD and I would like to simplify my CI/CD or even my training code (keeping them cloud agnostic) by using `dvc pull` inside my Docker container when initializing a training job. ](https://discordapp.com/channels/485586884165107732/485596304961962003/581256265234251776) Can DVC be used in this way? + +Yes, it’s definitely a valid case for DVC.
There are different ways of +organizing the storage that training machines are using to access data. From the +very simple — using local storage volume and pulling data from the remote +storage every time — to using NAS or EFS to store a shared DVC cache. + +### Q: [I was able to follow the getting started examples, however now I am trying to push my data to Github, I keep getting the following error: “ERROR: failed to push data to the cloud — upload is not supported by https remote”.](https://discordapp.com/channels/485586884165107732/563406153334128681/598866528984891403) + +HTTP remotes do not support upload yet. Example Get Started repository is using +HTTP to keep it read-only and abstract the actual storage provider we are using +internally. If you actually check the remote URL, you should see that it is an +S3 bucket and AWS provides an HTTP end-point to read data from it. + +### Q: I’m looking to configure AWS S3 as a storage for DVC. I’ve set up the remotes and initialized dvc in the git repository. I tried testing it by pushing a dataset in the form of an excel file. The command completed without any issues but this is what I’m seeing in S3. [DVC seems to have created a subdirectory in the intended directory called “35” where it placed this file with a strange name.](https://discordapp.com/channels/485586884165107732/485596304961962003/585967551708921856) + +This is not an issue, it is an implementation detail. There’s no current way to +upload the files with the original filename (In this case, the S3 bucket will +have the file `data.csv` but with another name `20/893143…`). The reason behind +this decision is because we want to store a file only once no matter how many +dataset versions it’s used in. Also, it’s a reliable way to uniquely identify +the file. You don’t have to be afraid that someone decided to create a file with +the same name (path) but a different content. 
+ +### Q: [Is it possible to only have a shared ‘local’ cache and no remote?](https://discordapp.com/channels/485586884165107732/563406153334128681/587730054893666326) I’m trying to figure out how to use this in a 40 node cluster which already has very fast NFS storage across all the nodes. Not storing everything twice seems desirable. Esp. for the multi-TB input data + +Yes, and it’s one of the very common use cases, actually. All you need to do is to +use the `dvc cache dir` command to set up an external cache. There are a few caveats +though. Please, read +[this link](https://discuss.dvc.org/t/share-nas-data-in-server/180/4?u=shcheklein) +for an example of the workflow. +
+ +If you have any questions, concerns or ideas, let us know in the comments below +or connect with DVC team [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are always open, too. diff --git a/content/blogs/2019-09-26-september-19-dvc-heartbeat.md b/content/blogs/2019-09-26-september-19-dvc-heartbeat.md new file mode 100644 index 0000000000..5b6d4fc030 --- /dev/null +++ b/content/blogs/2019-09-26-september-19-dvc-heartbeat.md @@ -0,0 +1,355 @@ +--- +title: September ’19 DVC❤️Heartbeat +date: 2019-09-26 +description: > + Announcing our first meetup in San Francisco, kicking off Google Season of + Docs program, sharing Open Source Summit experience, and more news, links, and + gems. +descriptionLong: > + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: 2019-09-26/post-image.jpeg +author: svetlana_grinchenko +commentsUrl: https://discuss.dvc.org/t/september-19-dvc-heartbeat/287 +tags: + - Community + - Heartbeat + - Meetup + - Open Source Summit +--- + +## News and links + +We are super excited to co-host our very first +**[meetup in San Francisco on October 10](https://www.meetup.com/San-Francisco-Machine-Learning-Meetup/events/264846847/)**! +We will gather at the brand new Dropbox HQ office at 6:30 pm to discuss +open-source tools to version control ML models and experiments. +[Dmitry Petrov](https://twitter.com/FullStackML) is teaming up with +[Daniel Fischetti](https://www.linkedin.com/in/daniel-fischetti-4a6592bb/) from +[Standard Cognition](https://standard.ai/) to discuss best ML practices. 
Join us +and save your spot now: + + + +If you are not in SF on this date and happen to be in Europe — don’t miss the +PyCon DE & PyData Berlin 2019 joint event on October 9–11. We cannot make it to +Berlin this year, but we were thrilled to discover 2 independent talks featuring +DVC by +[Alessia Marcolini](https://pyvideo.org/pydata-berlin-2019/version-control-for-data-science.html) +and +[Katharina Rasch](https://pyvideo.org/pydata-berlin-2019/tools-that-help-you-get-your-experiments-under-control.html). + +Some other highlights of the end of summer: + +- Our users and contributors keep creating fantastic pieces of content around + DVC (sharing some links below, but it’s only a fraction of what we have in + stock — can’t be more happy and humbled about it!). + +- We’ve reached 79 contributors to + [DVC core project](https://github.com/iterative/dvc) and 74 contributors to + [DVC documentation](https://github.com/iterative/dvc.org) (and have something + special in mind to celebrate our 100th contributors). + +- We enjoyed working with all the talented + [Google Season of docs](https://developers.google.com/season-of-docs/) + applicants and now moving to the next stage with our chosen tech writer + [Dashamir Hoxha](http://dashohoxha.fs.al/). + +- We’ve crossed the 3,000 stars mark on Github + ([over 3,500 now](https://github.com/iterative/dvc)). Thank you for your + support! + + https://twitter.com/DVCorg/status/1147220439472545793 + +- We’ve had great time at the + [Open Source Summit](https://events.linuxfoundation.org/events/open-source-summit-north-america-2019/program/) + by Linux foundation in San Diego — speaking on stage, running a booth and + chatting with all the amazing open-source crowd out there. + + https://twitter.com/a142hr/status/1164256520235675648 + +![](../uploads/images/2019-09-26/open-source-summit-by-linux-foundation.jpeg) + +
+ +Here are some of the great pieces of content around DVC and ML ops that we +discovered in July and August: + +- ** Great insightful discussion on Twitter about versioning ML projects started + by [Nathan Benaich](https://medium.com/@NathanBenaich).** + + https://twitter.com/NathanBenaich/status/1151815916512010242 + +- **[Our Machine Learning Workflow: DVC, MLFlow and Training in Docker Containers](https://medium.com/ixorthink/our-machine-learning-workflow-dvc-mlflow-and-training-in-docker-containers-5b9c80cdf804) + by [Ward Van Laer](https://medium.com/@ward.vanlaer).** + +> It is possible to manage your work flow using open-source and free tools. + + + +- **[Using DVC to create an efficient version control system for data projects](https://medium.com/qonto-engineering/using-dvc-to-create-an-efficient-version-control-system-for-data-projects-96efd94355fe) + by [Basile Guerrapin](https://medium.com/@basile_16101).** + +> DVC brought versioning for inputs, intermediate files and algorithm models to +> the VAT auto-detection project and this drastically increased our +> **productivity**. + + + +- **[Managing versioned machine learning datasets in DVC, and easily share ML projects with colleagues](https://techsparx.com/software-development/ai/dvc/versioning-example.html) + by [David Herron](https://twitter.com/7genblogger).** + +> In this tutorial we will go over a simple image classifier. We will learn how +> DVC works in a machine learning project, how it optimizes reproducing results +> when the project is changed, and how to share the project with colleagues. 
+ + + +- **[How to use data version control (dvc) in a machine learning project](https://towardsdatascience.com/how-to-use-data-version-control-dvc-in-a-machine-learning-project-a78245c0185) + by [Matthias Bitzer](https://towardsdatascience.com/@matthiasbitzer94).** + +> To illustrate the use of dvc in a machine learning context, we assume that our +> data is divided into train, test and validation folders by default, with the +> amount of data increasing over time either through an active learning cycle or +> by manually adding new data. + + + +- **[Version Control ML Model](https://towardsdatascience.com/version-control-ml-model-4adb2db5f87c) + by [Tianchen Wu](https://towardsdatascience.com/@TianchenW)** + +> This post presents a solution to version control machine learning models with +> git and dvc ([Data Version Control](https://dvc.org/doc/tutorial)). + + + +- **[Reflinks vs symlinks vs hard links, and how they can help machine learning projects](https://dev.to/robogeek/reflinks-vs-symlinks-vs-hard-links-and-how-they-can-help-machine-learning-projects-1cj4) + by [David Herron](https://medium.com/@7genblogger)** + +> In this blog post we’ll go over the details of using links, some cool new +> stuff in modern file systems (reflinks), and an example of how DVC (Data +> Version Control, [https://dvc.org/](https://dvc.org/)) leverages this. + + + +- **[DVC dependency management — a guide](https://blog.codecentric.de/en/2019/08/dvc-dependency-management/) + by [Bert Besser](https://blog.codecentric.de/en/author/bert-besser/) and + [Veronika Schwan](https://blog.codecentric.de/en/author/veronika-schindler/).** + +> This post is a follow-up to +> [A walkthrough of DVC](https://blog.codecentric.de/en/2019/03/walkthrough-dvc/) +> that deals with managing dependencies between DVC projects. In particular, +> this follow-up is about importing specific versions of an artifact (e.g. a +> trained model or a dataset) from one DVC project into another. 
+ + + +- **[Effective ML Teams — Lessons Learned](https://medium.com/@czeslaw.szubert/effective-ml-teams-lessons-learned-6a6e761bc283) + by [Czeslaw Szubert](https://medium.com/@czeslaw.szubert)** + +> In this post I’ll present lessons learned on how to setup successful ML teams +> and what you need to devise an effective enterprise ML strategy. + + + +- **[Lessons learned from training a German Speech Recognition model](https://www.esentri.com/lessons-learned-from-training-a-german-speech-recognition-model/) + by [David Schönleber](https://www.linkedin.com/in/dschoenleber/).** + +> Setting up a documentation-by-design workflow and using appropriate tools +> where needed, e.g. _MLFlow_ and _dvc,_ can be a real deal-breaker. + + + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: I’m getting an error message while trying to use AWS S3 storage: `ERROR: failed to push data to the cloud — Unable to locate credentials.` [Any ideas what’s happening?](https://discordapp.com/channels/485586884165107732/563406153334128681/587792932061577218) + +Most likely you haven’t configured your S3 credentials/AWS account yet. Please, +read the full documentation on the AWS website. The short version of what should +be done is the following: + +- [Create your AWS account.](https://portal.aws.amazon.com/gp/aws/developer/registration/index.html) + +- Log in to your AWS Management Console. + +- Click on your user name at the top right of the page. + +- Click on the Security Credentials link from the drop-down menu. + +- Find the Access Credentials section, and copy the latest `Access Key ID`. + +- Click on the Show link in the same row, and copy the `Secret Access Key`. + +Follow +[this link](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) +to setup your environment. + +### Q: I added data with `dvc add` or `dvc run` and see that it takes twice what it was before (with `du` command). [Does it mean that DVC copies data that is added under its control? How do I prevent this from happening?](https://discordapp.com/channels/485586884165107732/563406153334128681/595402051203235861) + +To give a short summary — by default, DVC copies the files from your working +directory to the cache (this is for safety reasons, it is better to duplicate +the data). If you have reflinks (copy-on-write) enabled on your file system, DVC +will use that method — which is as safe as copying. 
You can also configure DVC +to use hardlinks/symlinks to save some space and time, but it will require +enabling the protected mode (making data files in workspace read-only). Read +more details [here](https://dvc.org/doc/user-guide/large-dataset-optimization). + +### Q: [How concurrent-friendly is the cache? And different remotes? Is it safe to have several containers/nodes fill the same cache at the same time?](https://discordapp.com/channels/485586884165107732/563406153334128681/599345778703597568) + +It is safe and a very common use case for DVC to have a shared cache. Please, +check [this thread](https://discuss.dvc.org/t/share-nas-data-in-server/180/12), +for example. + +### Q: [What is the proper way to exit the ASCII visualization?](https://discordapp.com/channels/485586884165107732/563406153334128681/603890677176336394) (when you run `dvc pipeline show` command). + +See this +[document](https://dvc.org/doc/commands-reference/pipeline/show#options). To +navigate, use arrows or W, A, S, D keys. To exit, press Q. + +### Q: [Is there an issue if I set my `cache.s3` external cache to my default remote?](https://discordapp.com/channels/485586884165107732/563406153334128681/606197026488844338) I don’t quite understand what an external cache is for other than I have to have it for external outputs. + +The short answer is that we would suggest keeping them separately to avoid possible +checksum overlaps. Checksum on S3 might theoretically overlap with our checksums +(with the content of the file being different), so it could be dangerous. The +chances of losing data are pretty slim, but we would not risk it. Right now, we +are working on making sure there are no possible overlaps. + +### Q: [What’s the right procedure to move a step .dvc file around the project?](https://discordapp.com/channels/485586884165107732/563406153334128681/606425815139221504) + +Assuming the file was created with `dvc run`. There are a few possible ways.
+The obvious one is to delete the file and create a new one with +`dvc run --no-exec -f file/path/and/name.dvc`. Another possibility is to +rename/move and then edit manually. See +[this document](https://dvc.org/doc/user-guide/project-structure) that describes +how DVC-files are organized. No matter what method you use, you can run +`dvc commit file.dvc` to save changes without running the command again. + +### Q: [`dvc status` doesn’t seem to report things that need to be dvc pushed, is that by design?](https://discordapp.com/channels/485586884165107732/563406153334128681/606917839688957952) + +You should try `dvc status --cloud` or `dvc status --remote <name>` +to compare your local cache with a remote one; by default it only compares the +“working directory” with your local cache (to check whether something should be +reproduced and saved or not). + +### Q: [What kind of files can you put into `dvc metrics`?](https://discordapp.com/channels/485586884165107732/563406153334128681/608701494035873792) + +The file could be in any format; `dvc metrics show` will try to interpret the +format and output it in the best possible way. Also, if you are using `csv` or +`json`, you can use the `--xpath` flag to query specific measurements. **In +general, you can make any file a metric file and put any content into it, DVC is +not opinionated about it.** Usually, though, these are files that measure the +performance/accuracy of your model and capture the configuration of experiments. +The idea is to use `dvc metrics show` to display all your metrics across +experiments so you can make decisions about which combination (of features, +parameters, algorithms, architecture, etc.) works the best.
+ +### Q: [Does DVC take into account the timestamp of a file or is the MD5 only depends on the files actual/bits content?](https://discordapp.com/channels/485586884165107732/563406153334128681/613639458000207902) + +DVC takes into account only content (bits) of a file to calculate hashes that +are saved into DVC-files. + +### Q: [Similar to `dvc gc` is there a command to garbage collect from the remote?](https://discordapp.com/channels/485586884165107732/563406153334128681/616421757808541721) + +`dvc gc --remote NAME` is doing this, but you should be extra careful, because +it will remove everything that is not currently “in use” (by the working +directory). Also, please check this +[issue](https://github.com/iterative/dvc/issues/2325) — semantics of this +command might have changed by the time you read this. + +### Q: [How do I use and configure remote storage on IBM Cloud Object Storage?](https://discordapp.com/channels/485586884165107732/485596304961962003/591237578209099786) + +Since it’s S3 compatible, specifying `endpointurl` (exact URL depends on the +[region](https://cloud.ibm.com/docs/services/cloud-object-storage?topic=cloud-object-storage-endpoints)) +is the way to go: + +```dvc +$ dvc remote add -d mybucket s3://path/to/dir +$ dvc remote modify mybucket \ + endpointurl \ + https://s3.eu.cloud-object-storage.appdomain.cloud +``` + +### Q: [How can I push data from client to google cloud bucket using DVC?](https://discordapp.com/channels/485586884165107732/485596304961962003/592958360903483403). Just want to know how can i set the credentials. + +You can do it by setting environment variable pointing to yours credentials +path, like: + +```dvc +$ export GOOGLE_APPLICATION_CREDENTIALS=path/to/credentials +``` + +It is also possible to set this variable via `dvc config`: + +```dvc +$ dvc remote modify myremote credentialpath /path/to/my/creds +``` + +where `myremote` is your remote name. + +
+ +If you have any questions, concerns or ideas, let us know in the comments below +or connect with DVC team [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are always open, too. diff --git a/content/blogs/2019-10-08-dvc-org-for-hacktoberfest-2019.md b/content/blogs/2019-10-08-dvc-org-for-hacktoberfest-2019.md new file mode 100644 index 0000000000..ce3c36d241 --- /dev/null +++ b/content/blogs/2019-10-08-dvc-org-for-hacktoberfest-2019.md @@ -0,0 +1,114 @@ +--- +title: DVC.org for Hacktoberfest 2019 +date: 2019-10-08 +description: > + Our favorite month of the year Hacktoberfest is already in full swing and we + at DVC.org are so excited to be a part of it! +descriptionLong: > + Our favorite month of the year + [Hacktoberfest](https://hacktoberfest.digitalocean.com/) is already in full + swing and we at [DVC.org](https://dvc.org) are so excited to be a part of it! +picture: 2019-10-08/post-image.png +author: svetlana_grinchenko +commentsUrl: https://discuss.dvc.org/t/dvc-org-for-hacktoberfest-2019/286 +tags: + - Hacktoberfest + - Company +--- + +[Hacktoberfest](https://hacktoberfest.digitalocean.com/) is a monthly-long +program that celebrates open source and encourages you to contribute to open +source projects (and rewards you with stickers and a cool T-shirt!). Whether +you’re a seasoned contributor or looking for projects to contribute to for the +first time, you’re welcome to participate! + +It is the 6th season of Hacktoberfest and the 2d year of participating for +DVC.org team. We really enjoyed it in 2018 and this year we are upping the game +with our own cool stickers, special edition T-shirts and a +[collection of carefully picked tickets](https://github.com/iterative/dvc/labels/hacktoberfest). + +### How to participate? + +If you haven’t started your Hacktoberfest challenge yet, it is just the right +time, you have 3 weeks left to submit PRs and get your swag! 
Here are some +important details: + +- Hacktoberfest is open to everyone in the global community. + +- You can sign up anytime between October 1 and October 31. Make sure to sign up + on the + [official Hacktoberfest website](https://hacktoberfest.digitalocean.com/) for + your PRs to count. + +- To get a shirt, you must make 4 legit pull requests (PRs) between October 1–31 + in any time zone. + +- Pull requests can be made in any public GitHub-hosted repositories/projects, + not just the ones highlighted. + +And the special addition from DVC.org team: + +- Look through the list of + [DVC Hacktoberfest tickets](https://github.com/iterative/dvc/labels/hacktoberfest) + or the list of + [good DVC first issues](https://github.com/iterative/dvc/labels/good%20first%20issue). + +- Make a PR to DVC and get our stickers. + +- Close three issues for DVC and get a special DVC T-shirt. + +### Why contribute to DVC? + +[DVC](http://dvc.org) (Data Version Control) is a relatively young open source +project. It was started in late 2017 by a data scientist and an engineer to fill +in the gaps in the ML processes tooling. Nowadays DVC is growing pretty fast and +though our in-house team is quite small, we have to thank our contributors (more +than 100 in both code and docs) for developing DVC with us. + +DVC is participating in Hacktoberfest for 2 years in a row to bring more people +into open source, to learn from them and to give back by sharing our own +experience. This year we decided to focus on a single important topic for us — +improving UI/UX. + +As our contributors and maintainers were sifting through the feature requests, +bugs, and improvements to create a good +[list of Hacktoberfest tickets](https://github.com/iterative/dvc/labels/hacktoberfest), +we noticed that UI/UX label on Github is popping up again and again. 
DVC is a +command line tool, and improving UI/UX in our case means making decisions on how +to name command options, where and when to use +[confirmation prompts](https://github.com/iterative/dvc/issues/2498) and/or +where to abort execution, what exactly the user would expect to see in the output, how +to test it later, etc. + +Why does improving UI/UX appear to be so important for DVC at this stage? Perhaps +because the project is more mature now and we are ready to spend more time on +polishing it. Or maybe because it is still too engineering-focused and we used +to disregard/de-prioritize all this ‘fancy’ stuff. Or it is because we just lack +experience in creating good CLI UI/UX! + +One way or another, those are great reasons to focus on improving UI (in a broader +sense than just GUI), improving docs, creating a powerful, consistent experience +for our users and increasing accessibility of DVC. + +That’s how +[Heroku’s CLI style guide](https://devcenter.heroku.com/articles/cli-style-guide) +starts: + +> Heroku CLI plugins should provide a clear user experience, targeted primarily +> for human readability and usability, which delights the user, while at the +> same time supporting advanced users and output formats. This article provides +> a clear direction for designing delightful CLI plugins. + +At DVC we are building user experience in line with these principles too, but we +also have our own challenges. And here we turn for help to the global open +source community and all the contributors out there. + +For all of us who have a heart for open source — let’s discuss, contribute, +learn, take the technologies forward and build something great together! + +Happy hacking! +
+ +We are happy to hear from you [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are always open, too! diff --git a/content/blogs/2019-11-05-october-19-dvc-heartbeat.md b/content/blogs/2019-11-05-october-19-dvc-heartbeat.md new file mode 100644 index 0000000000..e050f421c5 --- /dev/null +++ b/content/blogs/2019-11-05-october-19-dvc-heartbeat.md @@ -0,0 +1,270 @@ +--- +title: October ’19 DVC❤️Heartbeat +date: 2019-11-05 +description: > + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. +descriptionLong: > + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: 2019-11-05/post-image.png +author: svetlana_grinchenko +commentsUrl: https://discuss.dvc.org/t/october-19-dvc-heartbeat/285 +tags: + - Meetup + - Heartbeat + - Hacktoberfest + - Community +--- + +## News and links + +Autumn is a great season for new beginnings and there is so much we love about +it this year. Here are some of the highlights: + +- Co-hosting our + [first ever meetup](https://www.meetup.com/San-Francisco-Machine-Learning-Meetup/events/264846847/)! + Our [Dmitry Petrov](https://twitter.com/FullStackML) partnering with + [Dan Fischetti](https://www.linkedin.com/in/daniel-fischetti-4a6592bb/) from + [Standard Cognition](https://twitter.com/standardAI) to discuss Open-source + tools to version control Machine Learning models and experiments. The + recording is available now here. 
+ + https://youtu.be/RHQXK7EC0jI + +- [Getting ready for the Hacktoberfest](https://blog.dataversioncontrol.com/dvc-org-for-hacktoberfest-2019-ce5320151a0c) + and having the whole team get together to pick up and label nice issues and be + ready to support the contributors. + +- Discovering some really cool blogposts, talks and tutorials from our users all + over the world: check + [this blogpost in French](https://blog.octo.com/mise-en-application-de-dvc-sur-un-projet-de-machine-learning/) + or + [this tutorial in German](https://jupyter-tutorial.readthedocs.io/de/latest/productive/dvc/)! + +- Having a great time working with a + [tech writer](https://github.com/dashohoxha) brought to us by the + [Google Season of Docs](https://developers.google.com/season-of-docs) program. + Check out these + [interactive tutorials](https://dvc.org/doc/tutorials/interactive) we’ve + created together. + +- Having hot internal discussion about Discord vs Slack support/community + channels. If you are on the edge like us, have a look at + [this discussion](https://internals.rust-lang.org/t/exploring-new-communication-channels/7859) + in the Rust community, so helpful. + +- Seeing [Dmitry Petrov](https://twitter.com/FullStackML) being really happy one + day: + + https://twitter.com/FullStackML/status/1169403554290814976 + +
+ +We at [DVC.org](https://dvc.org) are so happy every time we discover an article +featuring DVC or addressing one of the burning ML issues we are trying to solve. +Here are some of the links that caught our eye past month: + +- **Continuous Delivery for Machine Learning by + [Danilo Sato](https://twitter.com/dtsato), + [Arif Wider](https://twitter.com/arifwider), + [Christoph Windheuser](https://twitter.com/intellification) and curated by + [Martin Fowler](https://martinfowler.com/).** + +> As Machine Learning techniques continue to evolve and perform more complex +> tasks, so is evolving our knowledge of how to manage and deliver such +> applications to production. By bringing and extending the principles and +> practices from Continuous Delivery, we can better manage the risks of +> releasing changes to Machine Learning applications in a safe and reliable way. + + + +- **[The Path to Identity Validation](https://medium.com/signaturit-tech-blog/the-path-to-identity-validation-2-3-4f698b2ffae9) + by [Víctor Segura](https://medium.com/@victor.segura).** + +> So, the first question is clear: how to choose the optimal hardware for neural +> networks? Secondly, assuming that we have the appropriate infrastructure, how +> to build the machine learning ecosystem to train our models efficiently and +> not die trying? At **Signaturit**, we have the solution ;) + + + +- **Talk: + [Managing Big Data in Machine Learning projects](https://pretalx.com/pyconuk-2019/talk/GCLBFH/) + by [V Vishnu Anirudh](https://twitter.com/vvasworld) at the + [Pycon UK 2019.](https://2019.pyconuk.org/)** + +> My talk will focus on Version Control Systems (VCS) for big-data projects. +> With the advent of Machine Learning (ML) , the development teams find it +> increasingly difficult to manage and collaborate on projects that deal with +> huge amounts of data and ML models apart from just source code. 
+ +https://youtu.be/4XpHk85_x0E + +- **Podcast: TWIML Talk #295 + [Managing Deep Learning Experiments](https://twimlai.com/twiml-talk-295-managing-deep-learning-experiments-with-lukas-biewald/) + with [Lukas Biewald](https://twitter.com/l2k)** + +> Seeing a need for reproducibility in deep learning experiments, Lukas founded +> Weights & Biases. In this episode we discuss his experiment tracking tool, how +> it works, the components that make it unique in the ML marketplace and the +> open, collaborative culture that Lukas promotes. Listen to Lukas delve into +> how he got his start in deep learning experiments, what his experiment +> tracking used to look like, the current Weights & Biases business success +> strategy, and what his team is working on today. + + + +
+
+## Discord gems
+
+There are lots of hidden gems in our Discord community discussions. Sometimes
+they are scattered all over the channels and hard to track down.
+
+We are sifting through the issues and discussions and share with you the most
+interesting takeaways.
+
+### Q: I’ve just run a `dvc run` step, and realised I forgot to declare an output file. [Is there a way to add an output file without rerunning the (computationally expensive) step/stage?](https://discordapp.com/channels/485586884165107732/485596304961962003/593743448020877323)
+
+If you’ve already run it, you could just open the created DVC-file with an
+editor and add an entry to the outs field. After that, just run
+`dvc commit my.dvc` and it will save the checksums and data without re-running
+your command. `dvc run --no-exec` would also work with commit instead of
+modifying the DVC-file by hand.
+
+### Q: [For metric files do I have to use dvc run to set a metric or can I do it some other way?](https://discordapp.com/channels/485586884165107732/485596304961962003/593869598651318282) Can I use metrics functionality without the need to setup and manage DVC cache and remote storage?
+
+Any file that is under DVC control (e.g. added with `dvc add` or an output in
+`dvc run -o`) can be made a metric file with `dvc metrics add file`.
+Alternatively, the command `dvc run -M file` makes the file a metric without
+caching it. It means `dvc metrics show` can be used while the file is still
+versioned by Git.
+
+### Q: [Is there a way not to add the full (Azure) connection string to the .dvc/config file that is being checked into Git for using dvc remotes](https://discordapp.com/channels/485586884165107732/485596304961962003/595586670498283520)? I think it’s quite unhealthy to have secrets checked in SCM. 
+ +There are two options — use `AZURE_STORAGE_CONNECTION_STRING` environment +variable or use `--local` flag that will put it into the `.dvc/config.local` +that is added to the `.gitignore`, so you don’t track it with it and so won’t +expose secrets. + +### Q: [I would like to know if it is possible to manage files under DVC whilst keeping them in their original locations (e.g. on a network drive in a given folder structure)](https://discordapp.com/channels/485586884165107732/485596304961962003/601068667131920385)? [If I want to add a large file to be tracked by DVC, and it is in a bucket on S3 or GCS, can I do that without downloading it locally?](https://discordapp.com/channels/485586884165107732/485596304961962003/615278138896941101) + +Yes, you are probably looking for external dependencies and outputs. This is the +[link](https://dvc.org/doc/user-guide/managing-external-data) to the +documentation to start. + +### Q: [How do I setup DVC so that NAS (e.g. Synology) acts as a shared DVC cache?](https://discordapp.com/channels/485586884165107732/485596304961962003/606388040377565215) + +Using NAS (e.g. NFS) is a very common scenario for DVC. In short you use +`dvc cache dir` to setup a cache externally. Set cache type to use symlinks and +enable protected mode. We are preparing a +[document](https://github.com/iterative/dvc.org/blob/31c5d424c6530bb793af69c2af578d2b8a374d02/static/docs/use-cases/shared-storage-on-nfs.md) +how to setup the NFS as a shared cache, but I think it can be applied to any +NAS. + +### Q: So I have some data that is in the hundreds of gigs. [If I enable symlink, hardlink strategy and cache protecting, will DVC automatically choose this strategy over copying when trying to use dvc add](https://discordapp.com/channels/485586884165107732/485596304961962003/608013531010301952)? + +Yes, it will! Here is some clarification. 
So when you set those settings like +that, `dvc add` data will move data to your cache and then will create a +hardlink from your cache to your workspace. + +Unless your cache directory and your workspace are on different file systems, +move should be instant. Please, find more information +[here](https://dvc.org/doc/user-guide/large-dataset-optimization). + +### Q: My repo’s DVC is “busy and locked” and I’m not sure how it got that way and how to remove/diagnose the lock. [Any suggestions?](https://discordapp.com/channels/485586884165107732/485596304961962003/608392956679815168) + +DVC uses a lock file to prevent running two commands at the same time. The lock +[file](https://dvc.org/doc/user-guide/dvc-internals) is under the `.dvc` +directory. If no DVC commands running and you are still getting this error it’s +safe to remove this file manually to resolve the issue. + +### Q: [I’m trying to understand how does DVC remote add work in case of a local folder and what is the best workflow when data is outside of your project root?](https://discordapp.com/channels/485586884165107732/485596304961962003/611209851757920266) + +When using DVC, in most cases we assume that your data will be somewhere under +project root. There is an option to use so called +[external dependencies](https://dvc.org/doc/user-guide/managing-external-data), +which is data that is usually too big to be stored under your project root, but +if you operate on data that is of some reasonable size, I would recommend +starting with putting data somewhere under project root. Remotes are usually +places where you store your data, but it is DVC task to move your data around. +But if you want to keep your current setup where you will have data in different +place than your project, you will need to refer to data with full paths. So, for +example: + +1. You are in `/home/gabriel/myproject` and you have initialized dvc and git + repository + +2. 
You have `featurize.py` in your project dir, and want to use data to produce
+   some features and then `train.py` to train a model.
+
+3. Run the command:
+
+```dvc
+$ dvc run -d /research_data/myproject/videos \
+          -o /research_data/myproject/features \
+          python featurize.py
+```
+
+to tell DVC that you use `/research_data/myproject/videos` to featurize, and
+produce output to your features dir. Note that your code should be aware of
+those paths, they can be hardcoded inside `featurize.py`, but the point of
+`dvc run` is just to tell DVC what artifacts belong to the currently defined
+step of the ML pipeline.
+
+### Q: When I run the `du` command to check how much space the DVC project consumes I see that it duplicates/copies data. [It’s very space and time consuming to copy large data files, is there a way to avoid that?](https://discordapp.com/channels/485586884165107732/485596304961962003/613935477896249364) It takes too long to add large files to DVC.
+
+Yes! You don’t have to copy files with DVC. First of all, there are two reasons
+why `du` can show that it takes double the space to store data under DVC
+control. First, `du` can be inaccurate when the underlying file system supports
+reflinks (XFS on Linux, APFS on Mac, etc). This is actually the best scenario
+since no copying is happening and no changes are required to any DVC settings.
+Second, copy semantics may be used by default. It can be turned off by
+providing cache type `symlinks`, `hardlinks`. Please, read more on this
+[here](https://dvc.org/doc/user-guide/large-dataset-optimization#file-link-types-for-the-dvc-cache).
+
+### Q: [How can I detach a file from DVC control?](https://discordapp.com/channels/485586884165107732/485596304961962003/615479227189559323)
+
+Just removing the corresponding DVC-file and running `dvc gc` after that should
+be enough. It’ll stop tracking the data file and clean the local cache that
+might still contain it. Note! 
Don’t forget to run `dvc unprotect` if you use an
+advanced[ DVC setup with symlinks and hardlinks](https://dvc.org/doc/user-guide/large-dataset-optimization)
+(`cache.type` config option is not default). If `dvc gc` behavior is not
+granular enough, you can manually find the file by its checksum (taken from the
+DVC-file) in `.dvc/cache` and in the remote storage. Learn
+[here](https://dvc.org/doc/user-guide/dvc-internals#structure-of-cache-directory)
+how they are organized.
+
+### Q: [I’m trying to understand if DVC is an appropriate solution for storing data under GDPR requirements.](https://discordapp.com/channels/485586884165107732/485596304961962003/621057268145848340) That means that permanent deletion of files with sensitive data needs to be fully supported.
+
+Yes, in this sense DVC is not very different from using bare S3, SSH or any
+other storage where you can go and just delete data. DVC can give a bit of
+overhead to locate a specific file to delete, but otherwise it’s all the same —
+you will be able to delete any file you want. See more details on how you can
+retrospectively edit directories under DVC control
+[here](https://discordapp.com/channels/485586884165107732/485596304961962003/621062105524862987).
+ +If you have any questions, concerns or ideas, let us know in the comments below +or connect with DVC team [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are always open, too. diff --git a/content/blogs/2019-12-14-november-19-dvc-heartbeat.md b/content/blogs/2019-12-14-november-19-dvc-heartbeat.md new file mode 100644 index 0000000000..75f19c0e45 --- /dev/null +++ b/content/blogs/2019-12-14-november-19-dvc-heartbeat.md @@ -0,0 +1,278 @@ +--- +title: November ’19 DVC❤️Heartbeat +date: 2019-12-14 +description: > + Co-hosting our first ever meetup, sharing our Hacktoberfest experience, 4K ⭐, + fresh Discord gems and other news. +descriptionLong: > + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: 2019-12-14/post-image.jpeg +pictureComment: + How cool is this handmade swag from our community? We were in tears! +author: svetlana_grinchenko +commentsUrl: https://discuss.dvc.org/t/november-19-dvc-heartbeat/284 +tags: + - Meetup + - Heartbeat + - Hacktoberfest + - Community +--- + +The past few months have been so busy and full of great events! We love how +involved our community is and can’t wait to share more with you: + +- We have organized our very first + [meetup](https://www.meetup.com/San-Francisco-Machine-Learning-Meetup/events/264846847/)! + So many great conversations, new use cases and insights! Many thanks to + [Dan Fischetti](https://www.linkedin.com/in/daniel-fischetti-4a6592bb/) from + [Standard Cognition](https://standard.ai/), who joined our Dmitry Petrov on + stage. Watch the recording here. 
+ + https://youtu.be/RHQXK7EC0jI + +- [Hacktoberfest](https://blog.dataversioncontrol.com/dvc-org-for-hacktoberfest-2019-ce5320151a0c) + was a great exercise for DVC team on many levels and we really enjoyed + supporting new contributors. Kudos to + [Nabanita Dash](https://twitter.com/explorer_07) for organizing a cool + DVC-themed hackathon! + + https://twitter.com/psociiit/status/1185150096792535040 + +- We’ve crossed 4k stars mark on [Github](https://github.com/iterative/dvc)! + +- DVC was participating in the + [Devsprints](https://twitter.com/FossMec/status/1192866498324254720) (Thank + you [Kurian Benoy](https://twitter.com/kurianbenoy2) for the intro!) and we + were happy to jump in and help with some mentoring. + + https://twitter.com/FossMec/status/1192866498324254720 + +![](../uploads/images/2019-12-14/devsprints.png)_Devsprints participants on our +[Discord](http://dvc.org/chat) channel_ + +- DVC became part of the default + [Homebrew formulae](https://formulae.brew.sh/formula/dvc)! So now you can + install it as easy as `brew install dvc`! + +- We helped 2 aspiring speakers deliver their very first conference talks. + [Kurian Benoy](https://twitter.com/kurianbenoy2/status/1183427495342694401?s=20) + was speaking at [PyconIndia](https://in.pycon.org/2019/) and + [Aman Sharma](https://www.linkedin.com/in/aman-sharma606/) was speaking at + [SciPyIndia](https://scipy.in/2019#speakers). 
**Supporting speakers is
+  something we are passionate about and if you ever wanted to give a talk on a
+  DVC-related topic — we are here to help, just
+  [let us know](https://dvc.org/support)!**
+
+  https://youtu.be/Ipzf6oQqQpo
+
+- Our own [Dmitry Petrov](https://twitter.com/FullStackML) went to Europe to
+  speak at the
+  [Open Source Summit Europe](https://osseu19.sched.com/speaker/dmitry35) in
+  Lyon, [Highload++](https://www.highload.ru/moscow/2019/abstracts/6032) in
+  Moscow and made a stop in Berlin to co-host a
+  [meetup](https://www.meetup.com/codecentric-Berlin/events/265555810/) with our
+  favourite AI folks from [Codecentric](https://www.codecentric.de/)!
+
+ +Here are some of the great pieces of content around DVC and ML ops that we +discovered in October and November: + +- **[Deploy Machine Learning Models with Django](https://www.deploymachinelearning.com/) + by Piotr Płoński.** + +> …building your ML system has a great advantage — it is tailored to your needs. +> It has all features that are needed in your ML system and can be as complex as +> you wish. This tutorial is for readers who are familiar with ML and would like +> to learn how to build ML web services. + + + +- **[How to Manage Your Machine Learning Workflow with DVC, Weights & Biases, and Docker](https://towardsdatascience.com/how-to-manage-your-machine-learning-workflow-with-dvc-weights-biases-and-docker-5529ea4e59e0) + by [James Le](https://le-james94.medium.com).** + +> In this article, I want to show 3 powerful tools to simplify and scale up +> machine learning development within an organization by making it easy to +> track, reproduce, manage, and deploy models. + + + +- **[Creating a solid Data Science development environment](https://towardsdatascience.com/creating-a-solid-data-science-development-environment-60df14ce3a34) + by + [Gabriel dos Santos Goncalves](https://towardsdatascience.com/@gabrielsgoncalves)** + +> We do believe that Data Science is a field that can become even more mature by +> using best practices in project development and that Conda, Git, DVC, and +> JupyterLab are key components of this new approach + + + +- **[Creating reproducible data science workflows with DVC](https://medium.com/y-data-stories/creating-reproducible-data-science-workflows-with-dvc-3bf058e9797b) + by [Gleb Ivashkevich](https://medium.com/@glib.ivashkevych).** + +> DVC is a powerful tool and we covered only the fundamentals of it. + + + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: When you do a `dvc import` you get the state of the data in the original repo at that moment in time from that repo, right? [The overall state of that repo (e.g. Git `commit id` (hash)) is not preserved upon import, right?](https://discordapp.com/channels/485586884165107732/563406153334128681/618744949277458462) + +On the contrary, DVC relies on Git `commit id` (hash) to determine the state of +the data as well as code. Git `commit id` (hash) is saved in DVC file upon +import, data itself is copied/downloaded into DVC repo cache but would not be +pushed to the remote — DVC does not create duplicates. There is a command to +advance/update it when it’s needed — `dvc update`. Git commit hash saved to +provide reproducibility. Even if the source repo `HEAD` has changed your import +stays the same until you run `dvc update` or redo `dvc import`. + +### Q: I’m trying to understand if DVC is an appropriate solution for storing data under GDPR requirements. [That means that permanent deletion of files with sensitive data needs to be fully supported.](https://discordapp.com/channels/485586884165107732/485596304961962003/621057268145848340) + +Yes, in this sense DVC is not very different from using bare S3, SSH or any +other storage where you can go and just delete data. DVC can give a bit of +overhead to locate a specific file to delete, but otherwise it’s all the same +you will be able to delete any file you want. Read more details in +[this discussion](https://discordapp.com/channels/485586884165107732/485596304961962003/621062105524862987). 
+
+### Q: [Is there any way to get the remote url for specific DVC-files?](https://discordapp.com/channels/485586884165107732/485596304961962003/621591769766821888) Say, I have a DVC-file `foo.png.dvc` — is there a command that will show the remote url, something like `dvc get-remote-url foo.png.dvc` which will return e.g. the Azure url to download.
+
+There is no special command for that, but if you are using Python, you could use
+our API specifically designed for that:
+
+```python
+from dvc.api import get_url
+
+url = get_url(path,
+              repo="https://github.com/user/proj",
+              rev="mybranch")
+```
+
+so, you could as well use this from CLI as a wrapper command.
+
+### Q: [Can DVC be integrated with MS Active Directory (AD) authentication for controlling access?](https://discordapp.com/channels/485586884165107732/563406153334128681/619244714071425035) The GDPR requirements would force me to use such a system to manage access.
+
+Short answer: no (as of the date of publishing this Heartbeat issue). Good news
+— it should be very easy to add, so we would welcome a contribution :) Azure
+has a connection argument for AD — quick googling shows this
+[library](https://github.com/AzureAD/azure-activedirectory-library-for-python),
+which is probably what’s needed.
+
+### Q: [How do I uninstall DVC from Mac installed as a package?](https://discordapp.com/channels/485586884165107732/485596304961962003/625124341201502209)
+
+When installing using `plain.pkg` it is a bit tricky to uninstall, so we usually
+recommend using things like brew cask instead if you really need the binary
+package. Try to run these commands:
+
+```dvc
+$ sudo rm -rf /usr/local/bin/dvc
+$ sudo rm -rf /usr/local/lib/dvc
+$ sudo pkgutil --forget com.iterative.dvc
+```
+
+to uninstall the package. 
+
+### Q: We are using SSH remote to store data, but the problem is that everyone within the project has different username on the remote machine and thus we cannot set it in the config file (that is committed to Git). [Is there a way to add just host and path, without the username?](https://discordapp.com/channels/485586884165107732/563406153334128681/619420070111608848)
+
+Yes, you should use `--local` or `--global` config options to set the user per
+project or per user machine without sharing (committing) them to Git:
+
+```dvc
+$ dvc remote modify myremote --local user myuser
+```
+
+or
+
+```dvc
+$ dvc remote modify myremote --global user myuser
+```
+
+### Q: [I still get the `SSL ERROR` when I try to perform a dvc push with or without `use_ssl = false`](https://discordapp.com/channels/485586884165107732/485596304961962003/628227197592797191)?
+
+A simple environment variable like this:
+
+```dvc
+$ export AWS_CA_BUNDLE=/path/to/cert/cert.crt dvc push
+```
+
+should do the trick for now; we plan to fix the ca_bundle option soon.
+
+### Q: I have just finished a lengthy `dvc repro` and I’m happy with the result. However, I realized that I didn’t specify a dependency which I needed (and obviously is used in the computation). [Can I somehow fix it?](https://discordapp.com/channels/485586884165107732/563406153334128681/620572187841265675)
+
+Add the dependency to the stage file without rerunning/reproducing the stage.
+This is not needed as this additional dependency hasn’t changed.
+
+You would need to edit the DVC-file. In the deps section add:
+
+```yaml
+- path: not/included/file/path
+```
+
+and run `dvc commit file.dvc` to save changes w/o running the pipeline again.
+See an example
+[here](https://discordapp.com/channels/485586884165107732/563406153334128681/620641530075414570). 
+
+### Q: For some reason [we need to always specify the remote name when doing a `dvc push`](https://discordapp.com/channels/485586884165107732/485596304961962003/629704961868955648) e.g., `dvc push -r upstream` as opposed to `dvc push` (mind no additional arguments).
+
+You can mark a “default” remote:
+
+```dvc
+$ dvc remote add -d remote /path/to/my/main/remote
+```
+
+then, `dvc push` (and other commands like `dvc pull`) will know to push to the
+default remote.
+
+### Q: [If I want stage B to run after stage A, but the stage A has no output, can I specify A’s DVC-file as B’s dependency?](https://discordapp.com/channels/485586884165107732/563406153334128681/620715145374466048)
+
+No, at least at the time of publishing this. You could use a phony output
+though. E.g. make the stage A output some dummy file and make B depend on it.
+Please, consider creating or upvoting a relevant issue on our Github if you’d
+like this to be implemented.
+
+### Q: I’m just getting started with DVC, but I’d like to use it for multiple developers to access the data and share models and code. [I do own the server, but I’m not sure how to use DVC with SSH remote?](https://discordapp.com/channels/485586884165107732/563406153334128681/598867829785362452)
+
+Please, refer to
+[this answer](https://discuss.dvc.org/t/how-do-i-use-dvc-with-ssh-remote/279/2)
+on the DVC forum and check the documentation for the
+[`dvc remote add`](https://dvc.org/doc/command-reference/remote/add) and
+[`dvc remote modify`](https://dvc.org/doc/command-reference/remote/modify)
+commands to see more options and details.
+
+ +If you have any questions, concerns or ideas, let us know in the comments below +or connect with DVC team [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are always open, too. diff --git a/content/blogs/2020-01-17-january-20-dvc-heartbeat.md b/content/blogs/2020-01-17-january-20-dvc-heartbeat.md new file mode 100644 index 0000000000..ad74dba25f --- /dev/null +++ b/content/blogs/2020-01-17-january-20-dvc-heartbeat.md @@ -0,0 +1,145 @@ +--- +title: January ’20 DVC❤️Heartbeat +date: 2020-01-17 +description: > + Reaching 100 contributors, PyData LA, and more news from the DVC community. +descriptionLong: > + Every month we share news, findings, interesting reads, community takeaways, + and everything else along the way. Some of those are related to our brainchild + [DVC](https://dvc.org) and its journey. The others are a collection of + exciting stories and ideas centered around ML best practices and workflow. +picture: 2020-01-17/DVC_chalk_donuts.png +pictureComment: We spread the joys of version control and donuts at PyData LA. +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/january-20-dvc-heartbeat/314 +tags: + - Heartbeat + - PyData +--- + +Welcome to the New Year! Time for a recap of the last few weeks of activity in +the DVC community. + +## News + +We were honored to be named a [Project of the Year](https://ods.ai/awards/2019/) +by Open Data Science, Russia's largest community of data scientists and machine +learning practitioners. Check out our ⭐️incredibly shiny trophy⭐️! + +https://twitter.com/DVCorg/status/1209544709930016768 + +DVC hit **100 individual contributors** on Github! To celebrate our +100th contributor, [Vera Sativa](https://github.com/verasativa/), we +sent her \$500 to use on any educational opportunity and her own DeeVee (that's +our rainbow owl). 
We also awarded educational mini-grants to two of DVC's +biggest contributors, [Vít Novotný](https://github.com/witiko), and +[David Příhoda](https://twitter.com/david_prihoda). + +![](../uploads/images/2020-01-17/odd_with_deevee.png)_Vera (center, flashing a +peace sign) thanked us with this lovely picture of DeeVee and her team, +[Odd Industries](https://odd.co). They are making some extremely neat tools for +construction teams using computer vision._ + +**We were at PyData LA!** Our fearless leader +[Dmitry gave a talk](https://www.youtube.com/watch?v=7Wsd6V0k4Oc) and we set up +a busy booth to meet with the Pythonistas of Los Angeles. It was a cold and +blustery day, but visitors kept showing up to our semi-outdoor booth. We're sure +they came for the open source version control and not the donuts. + +![](../uploads/images/2020-01-17/py_data1.jpeg) +![](../uploads/images/2020-01-17/py_data2.jpeg) _The DVC team and PyData +volunteers who heroically staffed our booth in the rain._ + +Our engineer and technical writer Jorge reported: + +> We were super happy to meet all kinds of data professionals and enthusiasts in +> several fields who are learning and adopting DVC with their teams – including +> several working with privacy-sensitive medical records, very cool! + +
+ +## From the community + +Here are some rumblings from the machine learning (ML) and data science +community that got us talking. + +**A machine learning software wishlist.** Computer scientist and writer +[Chip Huyen](https://twitter.com/chipro) tweeted about her ML software wishlist +and kicked off a big community discussion. + +https://twitter.com/chipro/status/1202815757593108480 + +Her tweet resonated with a lot of practitioners, who were eager to discuss the +solutions they'd tried. Among the many thoughtful replies and recommendations, +we were thrilled to see DVC mentioned. + +https://twitter.com/kristijan_ivanc/status/1202879739716870144 + +If you haven't already, definitely check out Chip's +[thread](https://twitter.com/chipro/status/1202815757593108480), and follow her +on Twitter for more excllent, accessible content about ML engineering. We're +thinking hard about these ideas and hope the discussion continues on- and +offline. + +**A gentle intro to DVC for data scientists.** Scientist +[Elle O'Brien](https://twitter.com/andronovhopf) published a code walkthrough +about using DVC to make an image classification project more reproducible. +Specifically, the blog is a case study about version control when a dataset +grows over time. If you're looking for a DVC tutorial geared for data +scientists, this might be up your alley. + + + +**Ideas for data scientists to level up their code** Machine learning engineer +Andrew Greatorex posted a blog called “Down with technical debt! Clean Python +for data scientists.” Andrew highlights something we can easily relate to: the +“science” part of data science, which encourages experimentation and +flexibility, sometimes means less emphasis on readable, shareable code. 
Andrew +writes: + +> "I’m hoping to shed light on some of the ways that more fledgling data +> scientists can write cleaner Python code and better structure small scale +> projects, with the important side effect of reducing the amount of technical +> debt you inadvertently burden on yourself and your team.” + +In this blog, DVC gets a shout-out as Andrew’s preferred data versioning tool, +used in conjunction with Git for versioning Python code. Thanks! + + + +**An introduction to MLOps** Engineer +[Sharif Elfouly](https://twitter.com/elfouly_sharif) wrote an approachable guide +to thinking about MLOps, the growing field around making ML projects run +efficiently from experimentation to production. He summarises why managing ML +projects can be fundamentally different than traditional software development: + +> “The main difference between traditional software and ML is that you don’t +> only have the code. You also have data, models, and experiments. Writing +> traditional software is relatively straightforward but in ML you need to try +> out a lot of different things to find the best and fastest model for your +> use-case. You have a lot of different model types to choose from and every +> single one of them has its specific hyperparameters. Even if you work alone +> this can get out of hand pretty quickly.” + +Sharif gives some recommendations for tools that work especially well for ML, +and he writes that DVC is the “perfect combination for versioning your code and +data.” Thanks, Sharif! We think you’re perfect, too. + + + +That's a wrap for January. We'll see you next month with more updates! 
diff --git a/content/blogs/2020-01-20-january-20-community-gems.md b/content/blogs/2020-01-20-january-20-community-gems.md new file mode 100644 index 0000000000..cab81a49d4 --- /dev/null +++ b/content/blogs/2020-01-20-january-20-community-gems.md @@ -0,0 +1,150 @@ +--- +title: January '20 Community Gems +date: 2020-01-20 +description: > + Great discussions and technical Q&A's from our users. +descriptionLong: > + Every month we share news, findings, interesting reads, community takeaways, + and everything else along the way. Some of those are related to our brainchild + [DVC](https://dvc.org) and its journey. The others are a collection of + exciting stories and ideas centered around ML best practices and workflow. +picture: 2020-01-20/Community_Gems.png +pictureComment: +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/january-20-community-gems/315 +tags: + - Community Gems +--- + +## Discord gems + +There's a lot of action in our Discord channel these days. Ruslan, DVC's core +maintainer, said it best with a gif. + +https://twitter.com/rkuprieiev/status/1144008869414342658?ref_src=twsrc%5Etfw + +It's a lot to keep up with, so here are some highlights. We think these are +useful, good-to-know, and interesting conversations between DVC developers and +users. + +### Q: [What pros does DVC have compared to Git LFS?](https://discordapp.com/channels/485586884165107732/563406153334128681/657590900754612284) + +For an in-depth answer, check out this +[Stack Overflow discussion](https://stackoverflow.com/questions/58541260/difference-between-git-lfs-and-dvc). +But in brief, with DVC you don't need a special server, and you can use nearly +any kind of storage (S3, Google Cloud Storage, Azure Blobs, your own server, +etc.) without a fuss. There are also no limits on the size of the data that you +can store, unlike with GitHub. With Git LFS, there are some general LFS server +limits, too. 
DVC has additional features for sharing your data (e.g., +`dvc import`) and has pipeline support, so it does much more than LFS. Plus, we +have flexible and quick checkouts, as we utilize different link types (reflinks, +symlinks, and hardlinks). We think there are lots of advantages; of course, the +usefulness will depend on your particular needs. + +### Q: [How do I use DVC with SSH remote storage?](https://discordapp.com/channels/485586884165107732/563406153334128681/656016145119182849) I usually connect with a .pem key file. How do I do the same with DVC? + +DVC is built to work with the SSH protocol to access remote storage (we provide +some +[examples in our official documentation](https://dvc.org/doc/user-guide/external-dependencies#ssh)). +When SSH requires a key file, try this: + +```dvc +$ dvc remote modify myremote keyfile +``` + +### Q: [If you train a TensorFlow model that creates multiple checkpoint files, how do you establish them as dependencies in the DVC pipeline?](https://discordapp.com/channels/485586884165107732/563406153334128681/651098762466426891) + +You can specify a directory as a dependency/output in your DVC pipeline, and +store checkpointed models in that directory. It might look like this: + +```dvc +$ dvc run \ + -f train.dvc \ + -d data \ + -d train.py \ + -o models python code/train.py +``` + +where `models` is a directory created for checkpoint files. If you would like to +preserve your models in the data directory, though, then you would need to +specify them one by one. You can do this with bash: + +```dvc +$ dvc run $(for file in data/*.gz; do echo -n -d $file; done) +``` + +Be careful, though: if you declare checkpoint files to be an output of the DVC +pipeline, you won’t be able to re-run the pipeline using those checkpoint files +to initialize weights for model training. This would introduce circularity, as +your output would become your input. 
+ +Also keep in mind that whenever you re-run a pipeline with `dvc repro`, outputs +are deleted and then regenerated. If you don't wish to automatically delete +outputs, there is a `--persist` flag (see discussion +[here](https://github.com/iterative/dvc/issues/1214) and +[here](https://github.com/iterative/dvc/issues/1884)), although we don't +currently provide technical support for it. + +Finally, remember that setting something as a dependency (`-d`) doesn't mean it +is automatically tracked by DVC. So remember to `dvc add` data files in the +beginning! + +### Q: [Is it possible to use the same cache directory for multiple DVC repos that are used in parallel?](https://discordapp.com/channels/485586884165107732/485596304961962003/655012135973158942) Or do I need external software to prevent potential race conditions? + +This is absolutely possible, and you don't need any external software to safely +use multiple DVC repos in parallel. With DVC, cache operations are atomic. The +only exception is cleaning the cache with `dvc gc`, which you should only run +when no one else is working on a shared project that is referenced in your cache +(and also, be sure to use the `--projects` flag +[as described in our docs](https://dvc.org/doc/command-reference/gc)). For more +about using multiple DVC repos in parallel, check out some discussions +[here](https://discuss.dvc.org/t/setup-dvc-to-work-with-shared-data-on-nas-server/180) +and +[here](https://dvc.org/doc/use-cases/fast-data-caching-hub#example-shared-development-server). + +### Q: [What are some strategies for reproducibility if parts of our model training pipeline are run on our organizations's HPC?](https://discordapp.com/channels/485586884165107732/485596304961962003/652380507832844328) + +Using DVC for version control is entirely compatible with using remote computing +resources, like high performance computing (HPC), in your model training +pipeline. 
We think a great example of using DVC with parallel computing is +provided by [Peter Fogh](http://www.peterfogh.dk/) Take a +[look at his repo](https://github.com/PeterFogh/dvc_dask_use_case) for a +detailed use case. Please keep us posted about how HPC works in your pipeline, +as we'll be eager to pass on any insights to the community. + +### Q: Say I have a Git repository with multiple projets inside (one classification, one object detection, etc.). [Is it possible to tell DVC to just pull data for one particular project?](https://discordapp.com/channels/485586884165107732/563406153334128681/646760832616890408) + +Absolutely, DVC supports pulling data from different DVC files. An example would +be having two project subdirectories in your Git repo, `classification` and +`detection`. You could use `dvc pull -R classification` to only pull files in +that project to your workspace. + +If you prefer to be even more granular, you can `dvc add` files individually. +Then you can use `dvc pull .dvc` to retrieve the outputs specified +only by that file. + +### Q: [Is it possible to set an S3 remote without the use of AWS credentials with DVC?](https://discordapp.com/channels/485586884165107732/563406153334128681/623234659098296348) I want to publicly host a dataset so that everybody who clones my code repo can just run `dvc pull` to fetch the dataset. + +Yes, and we love the idea of publicly hosting a dataset. There are a few ways to +do it with DVC. We use one method in our own DVC project repository on Github. +If you run `git clone https://github.com/iterative/dvc` and then `dvc pull`, +you’ll see that DVC is downloading data from an HTTP repository, which is +actually just an S3 repository that we've granted public HTTP read-access to. + +So you would need to configure two remotes in your config file, each pointing to +the same S3 bucket through different protocols. 
Like this: + +```dvc +$ dvc remote add -d --local myremote s3://bucket/path +$ dvc remote add -d mypublicemote http://s3-external-1.amazonaws.com/bucket/path +``` + +Here's why this works: the `-d` flag sets the default remote, and the `--local` +flag creates a set of configuration preferences that will override the global +settings when DVC commands are run locally and won't be shared through Git (you +can read more about this +[in our docs](https://dvc.org/doc/command-reference/remote/add)). + +This means that even though you and users from the public are accessing the +stored dataset by different protocols (S3 and HTTPS), you'll all run the same +command: `dvc pull`. diff --git a/content/blogs/2020-02-04-gsoc-ideas-2020.md b/content/blogs/2020-02-04-gsoc-ideas-2020.md new file mode 100644 index 0000000000..671b2e72a8 --- /dev/null +++ b/content/blogs/2020-02-04-gsoc-ideas-2020.md @@ -0,0 +1,130 @@ +--- +title: Join DVC for Google Summer of Code 2020 +date: 2020-02-04 +description: > + A call for student applications for Google Summer of Code 2020. +descriptionLong: > + DVC is looking for students to take part in [Google Summer of Code + 2020](https://summerofcode.withgoogle.com/). +picture: 2020-02-04/Summer_of_Code_small.png +pictureComment: +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/join-dvc-for-google-summer-of-code/317 +tags: + - Google Summer of Code + - Students + - Mentoring + - Company +--- + +Announcement, announcement! After a successful experience with +[Google Season of Docs](https://developers.google.com/season-of-docs) in 2019, +we're putting out a call for students to apply to work with DVC as part of +[Google Summer of Code](https://summerofcode.withgoogle.com/). If you want to +make a dent in open source software development with mentorship from our team, +read on. + +## Prerequisites to apply + +Besides the general requirements to apply to Google Summer of Code, there are a +few skills we look for in applicants. + +1. 
**Python experience.** All of our core development is done in Python, so we + prefer candidates that are experienced in Python. However, we will consider + applicants who are very strong in another language and familiar with Python + basics. +2. **Git experience.** Git is also a key part of DVC development, as DVC is + built around Git; that said, for certain projects (rated as “Beginner”) a + surface-level knowledge of Git will be sufficient. +3. **People skills.** Beyond technical fundamentals, we put a high value on + communication skills: the ability to report and document your experiments and + findings, to work kindly with teammates, and explain your goals and work + clearly. + +If you like our mission but aren't sure if you're sufficiently prepared, please +be in touch anyway. We'd love to hear from you. + +## Project ideas + +Below are several project ideas that are an immediate priority for the core DVC +team. Of course,we welcome students to create their own proposals, even if they +differ from our ideas. Projects will be primarily mentored by co-founders +[Dmitry Petrov](https://github.com/dmpetrov) and +[Ivan Shcheklein](https://github.com/shcheklein). + +1. **Migrate to the latest v3 API to improve Google Drive support.** Our + organization is a co-maintainer of the PyDrive library in collaboration with + a team at Google. The PyDrive library is now several years old and still + relies on the v2 protocol. We would like to migrate to v3, which we expect + will boost performance for many DVC use cases (e.g. the ability to filter + fields being retrieved from our API, etc). For this project, we’re looking + for a student to work with us to prepare the next major version of the + PyDrive library, as well as making important changes to the core DVC code to + support it. Because PyDrive is broadly used outside of DVC, this project is a + chance to work on a library of widespread interest to the Python community. +

_Skills required:_ Python, Git, experience with APIs
+ _Difficulty rating:_ Beginner-Medium
+ +2. **Introducing parallelism to DVC.** One of DVC’s features is the ability to + create pipelines, linking data repositories with code to process data, train + models, and evaluate model metrics. Once a DVC pipeline is created, the + pipeline can be shared and re-run in a systematic and entirely reproducible + way. Currently, DVC executes pipelines sequentially, even though some steps + may be run in parallel (such as data preprocessing). We would like to support + parallelization for pipeline steps specified by the user. Furthermore, we’ll + need to support building flags into DVC commands that specify the level of + parallelization (CPU, GPU or memory).

_Skills required:_ + Python, Git. Some experience with parallelization and/or scientific computing + would be helpful but not required.
_Difficulty rating:_ Advanced +
+ +3. **Developing use cases for data registries and ML model zoos.** A new DVC + functionality that we’re particularly excited about is `summon`, a method + that can turn remotely-hosted machine learning artifacts such as datasets, + trained models, and more into objects in the user’s local environment (such + as a Jupyter notebook). This is a foundation for creating data catalogs of + data-frames and machine learning model zoos on top of Git repositories and + cloud storages (like GCS or S3). We need to identify and implement model zoos + (think PyTorch Hub, the Caffe Model Zoo, or the TensorFlow DeepLab Model Zoo) + and data registries for types that are not supported by DVC yet. Currently, + we’ve tested `summon` with PyTorch image segmentation models and Pandas + dataframes. We’re looking for students to explore other possible use cases. +

_Skills required:_ Python, Git, and some machine learning or + data science experience
_Difficulty rating:_ Beginner-Medium
+ +4. **Continuous delivery for JetBrains TeamCity.** Continuous integration and + continuous delivery (CI/CD) for ML projects is an area where we see + [DVC make a big impact](https://martinfowler.com/articles/cd4ml.html)- + specifically, by delivering datasets and ML models into CI/CD pipelines. + While there are many cases when DVC is used inside GitHub Actions and GitLab + CI, you will be transferring this experience to another type of CI/CD system, + [JetBrains TeamCity](https://www.jetbrains.com/teamcity/). We're working to + integrate DVC's model and dataset versioning into TeamCity's CI/CD toolkit. + This project would be ideal for a student looking to explore the growing + field of MLOps, an offshoot of DevOps with the specifics of ML projects at + the center.

_Skills required:_ Python, Git, bash scripting. It + would be nice, but not necessary, to have some experience with CI/CD tools + and developer workflow automation.
_Difficulty rating:_ + Medium-Advanced
+ +5. **DVC performance testing framework.** Performance is a core value of DVC. We + will be creating a performance monitoring and testing framework where new + scenarios (e.g., unit testing)can be populated. The framework should reflect + all performance improvements and degradations for each of the DVC releases. + It would be especially compelling if testing could be integrated with our + GitHub workflow (CI/CD). This is a great opportunity for a student to learn + about DVC and versioning in-depth and contribute to its stability.
+
_Skills required:_ Python, Git, bash scripting.
_Difficulty + rating:_ Medium-Advanced
+ +## If you'd like to apply + +Please refer to the +[Google Summer of Code](https://summerofcode.withgoogle.com/) application guides +for specifics of the program. Students looking to know more about DVC, and our +worldwide community of contributors, will learn most by visiting our +[Discord channel](https://dvc.org/chat), +[GitHub repository](https://github.com/iterative/dvc), and +[Forum](https://discuss.dvc.org/). We are available to discuss project proposals +from interested students and can be reached by [email](mailto:support@dvc.org) +or on our Discord channel. diff --git a/content/blogs/2020-02-10-february-20-dvc-heartbeat.md b/content/blogs/2020-02-10-february-20-dvc-heartbeat.md new file mode 100644 index 0000000000..520c91d271 --- /dev/null +++ b/content/blogs/2020-02-10-february-20-dvc-heartbeat.md @@ -0,0 +1,146 @@ +--- +title: February ’20 DVC❤️Heartbeat +date: 2020-02-10 +description: > + DVC talks around the world, new team members, and full-stack machine learning. +descriptionLong: > + Every month we share news, findings, interesting reads, community takeaways, + and everything else along the way. + + Look here for updates about [DVC](https://dvc.org), our journey as a startup, + projects by our users and big ideas about best practices in ML and data + science. +picture: 2020-02-10/heartbeat_black.png +pictureComment: + Just in time for Valentine's day, here's a seasonally-relevant DVC pipeline. +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/dvc-heartbeat-feburary-20/318 +tags: + - Heartbeat + - CI/CD +--- + +Welcome to the February Heartbeat! This month's featured image is a DVC pipeline +[created by one of our users](https://medium.com/nlp-trend-and-review-en/use-dvc-to-version-control-ml-dl-models-bef61dbfe477), +which _we_ think resembles a valentine. 
Here are some more highlights from our +team and our community: + +## News + +**Our team is growing!** In early January, DVC gained two new folks: engineer +[Saugat Pachhai](https://github.com/skshetry) and data scientist +[Elle O'Brien](https://twitter.com/andronovhopf). Saugat, based in Nepal, will +be contributing to core DVC. Elle (that's me!), currently in San Francisco, will +be leading data science projects and outreach with DVC. + +We're **gearing up for a spring full of talks** about DVC projects, including +new up-and-coming features for data cataloging and continuous integration. Here +are just a few events that have been added to our schedule: + + + + + + + +-Elle O'Brien was recently accepted to give a keynote at +[Women in Data Science](https://www.widsconference.org/) San Diego on May 9. The +talk is called "Packaging data and machine learning models for sharing." + +-Elle will also be speaking at [Div Ops](https://divops.org/), a new online +conference about (you guessed it) DevOps, on March 27. + +Look out for more conference announcements soon- in our **brand new community +page!** We've [just launched a new hub](https://dvc.org/community) for sharing +events, goings-ons, and ways to contribute to DVC. + +## From the community + +Our users continue to put awesome things on the internet. Like this AI blogger +who isn't afraid to wear his heart on his sleeve. + + + +Musa Atlihan writes: + +> From my experience, whether it is a real-world data science project or it is a +> data science competition, there are two major key components for success. +> Those components are API simplicity and reproducible pipelines. Since data +> science means experimenting a lot in a limited time frame, first, we need +> machine learning tools with simplicity and second, we need +> reliable/reproducible machine learning pipelines. Thanks to tools like Keras, +> LightGBM, and fastai we already have simple yet powerful tools for rapid model +> development. 
And thanks to DVC, we are building large projects with +> reproducible pipelines very easily. + +It's cool how Musa puts DVC in context with libraries for model building. In a +way, the libraries that have made it easier than ever to iterate through +different model architectures have increased the need for reproducibility in +proportion. + +Meanwhile in Germany, superusers Marcel Mikl and Bert Besser wrote +[another](https://blog.codecentric.de/en/2019/03/walkthrough-dvc/) seriously +comprehensive article about DVC for Codecentric. Marcel and Bert walk readers +through the steps to **build a custom machine learning training pipeline with +remote computing resources** like GCP and AWS. It's an excellent guide to +configuring model training with attention to _automation_ and _collaboration_. +We give them 🦉🦉🦉🦉🦉 out of 5. + + + +Here are a few more stories on our radar: + +- **AI Singapore shares their method for AI development and deployment.** This + .. + [blog about how Agile informs their processes](https://makerspace.aisingapore.org/2020/01/agile-ai-engineering-in-aisg/) + for continuous integration and delivery includes data versioning. + +- **Toucan AI dispenses advice for ML engineers.** This .. + [blog for practitioners](https://toucanai.com/blog/post/building-production-ml/) + discusses questions like, "When to work on ML vs. the processes that surround + ML". It covers how DVC is used for model versioning in the exploration stage + of ML. + +- **DVC at the University.** A recent .. + [pre-print from natural language processing researchers at Université Laval](https://arxiv.org/pdf/1912.01706.pdf) + explains how DVC facilitated dataset access for collaborators. + + > "In our case, the original dataset takes up to 6 Gigabytes. The previous way + > of retrieving the dataset over the network with a standard 20 Mbits/sec + > internet connexion took up to an hour to complete (including uncompressing + > the data). 
Using DVC reduced the retrieval time of the dataset to 3 minutes + > over the network with the same internet connexion." + + Thanks for sharing- this is a lovely result. Oh, and last... + +- **DVC is a job requirement**! We celebrated a small milestone when we stumbled + .. across a listing for a data engineer to support R&D at + [Elvie](https://www.elvie.com/en-us/), a maker of tech for women's health + (pretty neat mission). The decorations on the job posting are ours 😎 + +![](../uploads/images/2020-02-10/elvie.png)_A job advertisement featuring DVC._ diff --git a/content/blogs/2020-02-17-a-public-reddit-dataset.md b/content/blogs/2020-02-17-a-public-reddit-dataset.md new file mode 100644 index 0000000000..ff161e825e --- /dev/null +++ b/content/blogs/2020-02-17-a-public-reddit-dataset.md @@ -0,0 +1,327 @@ +--- +title: + AITA for making this? A public dataset of Reddit posts about moral dilemmas +date: 2020-02-17 +description: > + Releasing an open natural language dataset based on r/AmItheAsshole. +descriptionLong: > + Delve into an open natural language dataset of posts about moral dilemmas from + [r/AmItheAsshole](https://www.reddit.com/r/AmItheAsshole/). Use this dataset + for whatever you want- here's how to get it and start playing. +picture: 2020-02-17/post_header_gmoji.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/aita-for-making-this-a-public-dataset-of-reddit-posts-about-moral-dilemmas/323 +tags: + - Project + - Data + - Reddit + - Tutorial +--- + +In data science, we frequently deal with classification problems like, _is this +[Yelp reviewer unhappy](https://www.ics.uci.edu/~vpsaini/) with their brunch? Is +[this email](https://archive.ics.uci.edu/ml/datasets/spambase) begging me to +claim my long-lost inheritance spam? 
Does this +[movie critic](http://ai.stanford.edu/~amaas/data/sentiment/) have a positive +opinion of Cats?_ + +Perhaps we should also consider the fundamental introspective matter of, _am I +maybe being a bit of an asshole?_ + +I want to share a dataset of collected moral dilemmas shared on Reddit, as well +as the judgments handed down by a jury of Redditors. The wellspring of this data +is the [r/AmItheAsshole](https://www.reddit.com/r/AmItheAsshole/) subreddit, one +of the natural wonders of the digital world. In this article, I'll show you +what's in the dataset, how to get it, and some things you can do to move the +frontiers of Asshole research forward. + +## What makes an Asshole? + +r/AmItheAsshole is a semi-structured online forum that’s the internet’s closest +approximation of a judicial system. In this corner of the web, citizens post +situations from their lives and Redditors vote to decide if the writer has acted +as The Asshole or not. For example: + +![](../uploads/images/2020-02-17/aita_sample.png) + +Without bringing any code into the picture, it’s intuitive to think of each new +post as a classification task for the subreddit. Formally, we could think of the +subreddit as executing a function _f_ such that + +![](../uploads/images/2020-02-17/aita_formula.png '=500') + +Of course, finding f won’t be trivial. To be frank, I’m not positive how well we +could hope to forecast the rulings of the subreddit. A lot of posts are not easy +for me to decide- like, + +![](../uploads/images/2020-02-17/aita_llama.png) + +There are also many times I find myself disagreeing with the subreddit’s +verdict. All this is to say, I don’t think it’s obvious how well a given human +would do on the task of predicting whether Redditors find someone an Asshole. +Nor is it clear how well we could ever hope for a machine to do approximating +their judgment. + +It seems fun to try, though. 
It helps that the data is plentiful: because the +subreddit is popular and well-moderated, there’s an especially strong volume of +high-quality content (re: on-topic and appropriately formatted) being posted +daily. + +## Building the dataset + +I pulled content from r/AmITheAsshole dating from the first post in 2012 to +January 1, 2020 using the [pushshift.io](https://pushshift.io/) API to get post +ids and +[scores](https://www.reddit.com/wiki/faq#wiki_how_is_a_submission.27s_score_determined.3F), +followed by Reddit’s API ([praw](https://praw.readthedocs.io/en/latest/)) to get +post content and meta-data. Using a +[similar standard as OpenAI](https://openai.com/blog/better-language-models/) +for trawling Reddit, I collected text from posts with scores of 3 or more only +for quality control. This cut the number of posts from ~355K to ~111K. Each data +point contains an official id code, timestamp, post title, post text, verdict, +score, and comment count; usernames are not included. The scraping and cleaning +code is available +[in the project GitHub repo](https://github.com/iterative/aita_dataset). For +simplicity on the first iteration of this problem, I didn’t scrape post +comments, which can number in the thousands for popular posts. But, should +sufficient interest arise, I’d consider adding them to the dataset in some form. + +To focus on the task of classifying posts, I did some light cleaning: I removed +posts in which the body of the text was redacted (surprisingly common) or blank, +and attempted to remove edits where the author had clearly given away the +verdict (e.g., an edit that says, “Update: You’re right, I was the asshole.”). +There were also verdicts that only occurred once (“cheap asshole”, “Crouching +Liar; hidden asshole”, “the pizza is the asshole”), so I restricted the dataset +to posts with standard verdicts. This left ~63K points. 
Below is a sample of the +resulting dataframe: + +![](../uploads/images/2020-02-17/df_sample.png)_Click to enlarge._ + +The dataset is a snapshot of the subreddit in its current state, but the +subreddit is certain to change over time as new content gets added. In the +interest of having the most comprehensive dataset about being an asshole ever +collected, _I’m planning to update this dataset monthly with new posts._ + +## How to get the dataset + +Since this dataset will be updated regularly, we’re using git and DVC to +package, version, and release it. The data itself is stored in an S3 bucket, and +you can use DVC to import the data to your workspace. If you haven't already +you'll need to [install DVC](https://dvc.org/doc/install); one of the simplest +ways is `pip install dvc`. + +Say you have a directory on your local machine where you plan to build some +analysis scripts. Simply run + +```dvc +$ dvc get https://github.com/iterative/aita_dataset \ + aita_clean.csv +``` + +This will download a .csv dataset into your local directory, corresponding to +the cleaned version. If you wanted the raw dataset, you would substitute +`aita_raw.csv` for `aita_clean.csv`. + +Because the dataset is >100 MB, I’ve created a git branch (called “lightweight”) +with 10,000 randomly sampled (cleaned) data points for quick-and-dirty +experimentation that won’t occupy all your laptop’s memory. To download only +this smaller dataset, run + +```dvc +$ dvc get --rev lightweight \ + https://github.com/iterative/aita_dataset \ + aita_clean.csv +``` + +## A quick look at the data + +Let’s take a flyover look at the dataset so far. The code to make the following +visuals and results is +[available on GitHub](https://github.com/andronovhopf/aita_viz_and_classify). +First, here’s a frequency plot for how common different verdicts are on the +subreddit. In addition to “Asshole” and “Not the Asshole”, there are two +additional rulings: “Everybody Sucks” and “No Assholes Here”. 
+ +![](../uploads/images/2020-02-17/freq_plot.svg) + +In general agreement with an +[analysis by Nathan Cunn](http://www.nathancunn.com/2019-04-04-am-i-the-asshole/), +the majority of posts are deemed “Not the Asshole” or “No Assholes Here”. If you +are posting on r/AmITheAsshole, you are probably not the asshole. + +Next, I attempted a very basic classifier, logistic regression using 1-gram +frequencies (i.e., the frequency of word occurences in post titles and bodies) +as features. This is intended to give a baseline for what kind of performance +any future modeling efforts should beat. Because of the strong class imbalance, +I used +[SMOTE to oversample](https://imbalanced-learn.org/stable/over_sampling.html#smote-variants) +Asshole posts. And, for simplicity, I binarized the category labels: + +| Verdict | Label | +| :--------------: | :---: | +| Asshole | 1 | +| Everyone Sucks | 1 | +| Not the Asshole | 0 | +| No Assholes Here | 0 | + +With 5-fold cross-validation, this classifier performed above-chance but +modestly: accuracy was 62.0% +/- 0.005 (95% confidence interval). Curiously, the +only other classifier attempt I could find online +[reported 61% accuracy on held-out data](https://github.com/amr-amr/am-i-the-asshole) +using the much more powerful BERT architecture. Considering that logistic +regression has zero hidden layers, and our features discard sequential +information entirely, we’re doing quite well! Although I can’t be certain, I’m +curious how much the discrepancy comes down to dataset size: the previous effort +with BERT appears to be trained on ~30K posts. + +Seeing that logistic regression on word counts doesn’t produce total garbage, I +looked at which words were predictive of class using the +[chi-squared test](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html). +The top five informative words were mom, wife, mother, edit, and dad (looks like +Assholes go back to edit their posts). 
Since familial relationships featured +prominently, I +[estimated the log odds ratio](https://www.tidytextmining.com/twitter.html#comparing-word-usage) +of being voted Asshole (versus Not the Asshole) if your post mentions a mom, +dad, girlfriend/wife or boyfriend/husband. Roughly, the log odds ratio +represents the difference in probability of a keyword occurring in Asshole posts +compared to Not-Asshole posts. + +![](../uploads/images/2020-02-17/svg_kw2.svg) + +For reference, the log odd ratios are computed with base 2; a score of 1 means +that Asshole posts are twice as likely to contain the keyword as Not the Asshole +posts. So keep in mind that the effect sizes we’re detecting, although almost +certainly non-zero, are still fairly small. + +There seems to be a slight anti-parent trend, with Redditors being more likely +to absolve authors who mention a mom or dad. Only mentioning a female romantic +partner (wife/girlfriend) was associated with a greater likelihood of being +voted the Asshole. This surprised me. My unsubstantiated guess about the gender +difference in mentioning romantic partners is that women may be particularly +likely to question themselves when they act assertively in a relationship. If +this were the case, we might find an especially high proportion of +uncontroversial “Not the Asshole” posts from heterosexual women asking about +situations with their male partners. + +## How to get more data + +As I said earlier, the plan is to grow the dataset over time. I’ve just run a +new scrape for posts from January 1-31, 2020 and am adding them to the public +dataset now. To check for a new release, you can re-run the `dvc get` command +you used to grab the dataset. + +If you’re serious about taking on a project such as, say, building a classifier +that beats our state of the art, word-count-based, logistic regression model, +I’d like to recommend a better way to integrate the dataset into your workflow: +`dvc import`. 
`dvc import` is like `dvc get`, but it preserves a link to the
hosted data set. This is desirable if you might iterate through several
experiments in the search for the right architecture, for example, or think
you’ll want to re-train a model. To get the dataset the first time, you’ll run:

```dvc
$ git init
$ dvc init
$ dvc import https://github.com/iterative/aita_dataset \
  aita_clean.csv
```

Then, because the dataset in your workspace is linked to our dataset repository,
you can update it by simply running:

```dvc
$ dvc update aita_clean.csv
```

An additional benefit of codifying the link between your copy of the dataset and
ours is that you can track the form of the dataset you used at different points
in your project development. You can jump back and forth through the project
history then, not only to previous versions of code but also to versions of
(specifically, links to) data. For example, you could roll back the state of the
project to before you updated the dataset and re-run your classifier:

```dvc
$ git log --oneline
58e28a5 retrain logistic reg
6a44161 update aita dataset
0de4fc3 try logistic regression classifier
a266f15 get aita dataset
55031b0 first commit

$ git checkout 0de4fc3
$ dvc checkout
$ python train_classifier.py
```

Oh, and one more note: you can always use `dvc get` and `dvc import` to grab an
older version of the dataset using the tags associated with each release. The
current release is v.20.1 and the original release is v.20.0- the numeric codes
correspond to the year and month.

```dvc
$ dvc get --rev v.20.0 \
  https://github.com/iterative/aita_dataset aita_clean.csv
```

## What’s next

I hope that sharing this evolving dataset invites some curiosity, because a lot
of questions come to mind:

1. Can you beat our classifier that predicts how the subreddit will rule?
2. Is verdict even the most interesting outcome to predict? 
For example,
   developer Scott Ratigan
   [created a tool to estimate weighted scores](https://github.com/scotteratigan/amitheahole)
   for each post based on the comments (e.g., 75% Asshole, 25% Not the Asshole).
   What metrics might invite deeper questions?
3. Can you identify sentences or phrases that are most informative about the
   verdict Redditors reach?
4. Do voting patterns systematically differ by topic of discussion?
5. How reliable are verdicts? When a very similar situation is posted multiple
   times, do Redditors usually vote the same way?
6. Is the subreddit’s posting and voting behavior changing over time?
7. Can you formulate any testable hypotheses based on
   [this survey of the subreddit’s demographics](https://www.reddit.com/r/AmItheAsshole/comments/dcae07/2019_subscriber_survey_data_dump/?)
8. How often do non-Redditors agree with the subreddit? Under what circumstances
   might they tend to disagree?

I expect that leaning into the particulars of the dataset- thinking about how
the format influences the content, and how a subreddit might select for
participants that don’t fully represent the population at large- will lead to
more interesting questions than, say, aiming to forecast something about
morality in general. To put it another way, the data’s not unbiased- so maybe
try to learn something about those biases.

If you make something with this dataset, please share- perhaps we can form an
international Asshole research collective, or at least keep each other apprised
of findings. And of course, reach out if you encounter any difficulties or
probable errors (you can file issues
[on the GitHub repo](https://github.com/iterative/aita_dataset))!

Lastly, please stay tuned for more releases- there are hundreds of new posts
every day. The biggest asshole may still be out there.

+ +### More resources + +You may want to check out a few more efforts to get at r/AmItheAsshole from a +data-scientific perspective, including +[topic modeling](https://medium.com/@tom.gonda/what-does-reddit-argue-about-28432b11ea26), +[visualizing voting patterns](http://www.nathancunn.com/2019-04-04-am-i-the-asshole/) +and +[growth of the subreddit](https://twitter.com/felipehoffa/status/1223278090958209025), +and +[classification](https://www.informatik.hu-berlin.de/de/forschung/gebiete/wbi/teaching/studienDiplomArbeiten/finished/2019/expose_fletcher.pdf) +with [deep learning](https://github.com/amr-amr/am-i-the-asshole). With a +dataset this rich, there’s much more to be investigated, including continuing to +refine these existing methods. And there’s almost certainly room to push the +state of the art in asshole detection! + +If you're interested in learning more about using Reddit data, check out +[pushshift.io](https://pushshift.io/), a database that contains basically all of +Reddit's content (so why make this dataset? I wanted to remove some of the +barriers to analyzing text from r/AmItheAsshole by providing an +already-processed and cleaned version of the data that can be downloaded with a +line of code; pushshift takes some work). You might use pushshift's API and/or +praw to augment this dataset in some way- perhaps to compare activity in this +subreddit with another, or broader patterns on Reddit. diff --git a/content/blogs/2020-02-19-february-20-community-gems.md b/content/blogs/2020-02-19-february-20-community-gems.md new file mode 100644 index 0000000000..72c272a684 --- /dev/null +++ b/content/blogs/2020-02-19-february-20-community-gems.md @@ -0,0 +1,152 @@ +--- +title: February '20 Community Gems +date: 2020-02-19 +description: > + Great discussions and technical Q&A's from our users. +descriptionLong: > + Look here every month for great discussions and technical Q&A's from our users + and core development team. 
+picture: 2020-02-19/feb20_gems_header_gr.png
author: elle_obrien
commentsUrl: https://discuss.dvc.org/t/feb-20-community-gems/330
tags:
  - Google Drive
  - Azure
  - Community Gems
  - Homebrew
---

## Discord gems

Welcome to the February roundup of useful, intriguing, and good-to-know
discussions going on with DVC users and developers. Let's dive right in with
some questions from our Discord channel.

### Q: [If I have multiple outputs from a DVC pipeline and only want to checkout one, what command would I run?](https://discordapp.com/channels/485586884165107732/563406153334128681/670233820326264843)

By default, `dvc checkout` is written for a
[Git-like experience](https://dvc.org/doc/command-reference/checkout), meaning
that it will sync your local workspace with all the model files, dependencies,
and outputs specified by a project's `.dvc` files. If you only want to access
one artifact from the project, you can do this with
`dvc checkout <target>`. This will deliver the specified file to your
workspace.

If you're interested in sharing specific artifacts (like data files or model
binaries) with other users, you might also consider `dvc get` and `dvc import`.
These functions are ideal for downloading a single file (or a few files) to the
local workspace, instead of the whole project.

### Q: [I have a complicated use case.](https://discordapp.com/channels/485586884165107732/563406153334128681/668773484549242890) We're trying to set up a system where users act as data scientists. They'd select data, which would be cleaned/transformed in the backend, and experiment with model hyperparameters until they're happy with the model result. Then they can "save" the model, including artifacts like the input data used, metrics, and binary model file, placing the experiment under version control. Later they can "load" the model again and select new input data from our database, change parameters, and "update it". 
There might be hundreds of separate models. Can DVC do this?

Most of this functionality is supported by DVC already. We recommend
`dvc import` as a method for giving users access to data in a repository (and
also check out our
[tutorial on data registries](https://dvc.org/doc/use-cases/data-registries)).
For pre-processing data,
[DVC pipelines](https://dvc.org/doc/get-started/pipeline) can automate a
procedure for transforming and cleaning inputs (i.e., you can use bash scripts
to `dvc run` the pipeline whenever a user selects a dataset). Saving the
workspace after experimentation, including model files, metrics, and outputs, is
a core function of DVC (see `dvc add` and `dvc push` functions). We also have a
[Python API](https://dvc.org/doc/use-cases/data-registries#programatic-reusability-of-dvc-data)
so users can load artifacts like datasets and model files into their local
Python session. When they're done experimenting, they can `dvc add` and
`dvc push` their progress. Users can later "pull" a saved workspace and all
associated files using `dvc checkout`.

As for how to organize hundreds of separate experiments, we're still evolving
our strategy and best-practice recommendations. It's conceivable that each
experiment could be carried out and saved on a separate branch of a project
repository. Our thoughts about structuring version control around architecture
search and hyperparameter tuning could fill up a whole blog (and probably will
in the not-so-distant future); check out one of our
[recent conversation threads](https://github.com/iterative/dvc/issues/2799) if
you'd like to see where we're currently at. And please let us know how your use
case goes—at this stage, we'd love to hear what works for you.

### Q: [What's the difference](https://discordapp.com/channels/485586884165107732/563406153334128681/666708671333400599) between `config` and `config.local` files? Is it safe to do git commit without including my config file? 
+
There are indeed two kinds of config files you might come across in your project
directory's `.dvc` folder and `.gitignore` file. The key difference is that
`config` is intended to be committed to Git, while `config.local` is not. You'd
use `config.local` to store sensitive information (like personal credentials for
SSH or another kind of authenticated storage) or settings specific to your local
environment—things you wouldn't want to push to a GitHub repo. DVC only modifies
`config.local` when you explicitly use the `--local` flag in the `dvc config` or
`dvc remote *` commands, so outside of these cases you shouldn't have to worry
about it.

As for using `git commit` without the `config` file, it is safe. _But_ you
should check if there are any settings in `config.local` that you actually want
to save to `config`. This would be rare, since as we mentioned, you'd only have
settings in `config.local` if you expressly called for them with the `--local`
flag.

### Q: I have an Azure storage account container, and the only link I can see in my Azure portal for the container is an `http://` link. But the tutorial on DVC shows Azure storage accessed with the `azure://` protocol. [Which is right?](https://discordapp.com/channels/485586884165107732/563406153334128681/675087897661276169)

What you're describing is exactly as it should be. `azure://` is an internal URL
protocol that tells DVC which API to use to connect to your remote storage, not
the exact address of your Blob. You can use the format
`azure://<container-name>/<path>`. For more details, you can refer to
our documentation about
[supported storage types](https://dvc.org/doc/command-reference/remote/add#supported-storage-types). 
+ +### Q: [I'm using DVC to version my data with Google Drive storage.](https://discordapp.com/channels/485586884165107732/563406153334128681/667198775361536019) If I want a developer to be able to download the data, can I give them my `gdrive_client_id` and `gdrive_client_secret`, or maybe give them permission to access my Google Drive folder? + +For Google Drive, `gdrive_client_id` and `gdrive_client_secret` aren't used to +access a specific user's Google Drive disk; they're predominantly used by +Google's API to +[track usage and set appropriate rate limits](https://rclone.org/drive/#making-your-own-client-id). +So the risk in sharing them is not that your personal files will be vulnerable, +but that your API usage limits could be negatively affected if others are using +it with your credentials. Whether this risk is acceptable is up to you. It's not +unusual for teams and organizations to share a set of credentials, so a +reasonable level of security may mean ensuring that the `config` file for your +project (which typically contains Google Drive credentials) is only visible to +team members. + +Please check out our +[docs about Google Drive](https://dvc.org/doc/user-guide/setup-google-drive-remote), +too, for more about how DVC uses the Google Drive API. + +### Q: I just tried to upgrade DVC via `homebrew` and got a "SHA256 mismatch" error. [What's going on](https://discordapp.com/channels/485586884165107732/485596304961962003/672930535261339669)? + +What most likely happened is that you first installed DVC via +`brew install iterative/homebrew-dvc/dvc`, which is no longer supported—because +DVC is now a core Homebrew formula! Please uninstall and reinstall using +`brew install dvc` for uninterrupted upgrades in the future. 
+
### Q: [I still can't convince myself to version-control the data rather than meta-data.](https://www.reddit.com/r/datascience/comments/aqkg59/does_anyone_use_data_version_control_dvc_thoughts/eq62lkt?utm_source=share&utm_medium=web2x) Can anyone give me a strong argument against version controlling data file paths in config files instead of using DVC?

_This question is from a [Reddit discussion.](https://bit.ly/38HOEcj)_

Versioning the meta-data associated with your dataset is certainly a workable
strategy. You can use prefixes and suffixes to distinguish models trained on
different versions of data, and keep your data files in one `.gitignored`
directory. That may be enough for some projects. In our experience, though,
we've found this comes with a host of complications that don't scale well:

1. You'll have to write custom code to support this configuration, specifying
   filepaths to your dataset with hardcoded links.
2. For files that are outputs of your analysis pipeline, you'll need to agree on
   conventions for suffixes/prefixes for naming to specify which version of the
   dataset was used.
3. Depending on the meta-data you use to version data files, you may not detect
   changes made by users. Even if you can tell a change has occurred, you may
   not be able to track _who_ did it _when_.

We designed DVC to optimize data management from the user's perspective: users
can change the dataset version without changing their code, so organizations
don't have to adhere to explicit filenaming conventions and hardcoded links that
are prone to human error. Furthermore, versioning data similar to how Git
versions code provides a largely immutable record of every change that has
occurred. We think this is important as teams and projects grow in complexity.
And from a systems-level perspective, DVC does more than track data: it
deduplicates files behind the scenes, provides simple interfaces for sharing
datasets (and models!) 
with collaborators and users, and connects specific model +files with the dataset versions they were trained on. + +To summarize, DVC is not the only way to version your data. But we think it's +one way to reduce the overhead of managing data infrastructure when your project +involves experimentation or collaboration. diff --git a/content/blogs/2020-03-11-march-20-dvc-heartbeat.md b/content/blogs/2020-03-11-march-20-dvc-heartbeat.md new file mode 100644 index 0000000000..9d7270d053 --- /dev/null +++ b/content/blogs/2020-03-11-march-20-dvc-heartbeat.md @@ -0,0 +1,138 @@ +--- +title: March ’20 DVC❤️Heartbeat +date: 2020-03-11 +description: > + DVC discussions around the web, our growing team, and recommended reading from + the open source community. +descriptionLong: > + Every month we share news, findings, interesting reads, community takeaways, + and everything else along the way. + + Look here for updates about [DVC](https://dvc.org), our journey as a startup, + projects by our users and big ideas about best practices in ML and data + science. +picture: 2020-03-11/March_20_HB_header.png +pictureComment: +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/march-20-heartbeat/335 +tags: + - Heartbeat + - CI/CD + - Book + - Monorepo + - New feature +--- + +Welcome to the March Heartbeat! Here are some highlights from our team and +community this past month: + +## News + +**DVC is STILL growing!** In February, Senior Software Engineer +[Guro Bokum](https://www.linkedin.com/in/jiojiajiu/) joined DVC. He's previously +contributed to the core DVC code base and brings several years of full-stack +engineering expertise to the team. Welcome, Guro! + +![](../uploads/images/2020-03-11/hi_guro.png 'Img=500x667')_Welcome, Guro!_ + +**New feature alert.** We've received many requests for +[monorepo](https://en.wikipedia.org/wiki/Monorepo) support in DVC. 
As of DVC +[release 0.87.0](https://github.com/iterative/dvc/releases), users can version +data science projects within a monorepo! The new `dvc init --subdir` +functionality is designed to allow multiple DVC repositories within a single Git +repository. Don't forget to upgrade and +[check out the latest docs](https://dvc.org/doc/command-reference/init). + +## From the community + +First, there's an intriguing +[discussion evolving in the DVC repo](https://github.com/iterative/dvc/issues/3393) +about how machine learning hyperparameters (such as learning rate, number of +layers in a deep neural network, etc.) can be tracked. Right now, +hyperparameters are tracked as source code (i.e., with Git). Could we use some +kind of abstraction to separate hyperparameters from source code in a +DVC-managed project? Read on and feel free to jump into this discussion, largely +helmed by software developer and DVC contributor +[Helge Munk Jacobsen](http://elgehelge.github.io/). + +Another discussion we appreciated happened on Twitter: + + + +Thanks, [@cyberomin](https://twitter.com/cyberomin)! + +Elsewhere on the internet, DVC made the cut in a much-shared blog, +[Five Interesting Data Engineering Projects](https://medium.com/@squarecog/five-interesting-data-engineering-projects-48ffb9c9c501) +by [Dmitry Ryaboy](https://twitter.com/squarecog) (VP of Engineering at biotech +startup Zymergen, and formerly Twitter). Dmitry wrote: + +> To be honest, I’m a bit of a skeptic on “git for data” and various automated +> data / workflow versioning schemes: various approaches I’ve seen in the past +> were either too partial to be useful, or required too drastic a change in how +> data scientists worked to get a realistic chance at adoption. So I ignored, or +> even explicitly avoided, checking DVC out as the buzz grew. I’ve finally +> checked it out and… it looks like maybe this has legs? Metrics tied to +> branches / versions are a great feature. 
Tying the idea of git-like branches
> to training multiple models makes the value prop clear. The implementation,
> using Git for code and datafile index storage, while leveraging scalable data
> stores for data, and trying to reduce overall storage cost by being clever
> about reuse, looks sane. A lot of what they have to say in
> https://dvc.org/doc/understanding-dvc rings true.

Check out the full blog here:


One of the areas that DVC is growing into is continuous integration and
continuous deployment (CI/CD), a part of the nascent field of MLOps. Naturally,
we were thrilled to discover that CI/CD with DVC is taught in a new Packt book,
["Learn Python by Building Data Science Applications"](https://www.packtpub.com/programming/learn-python-by-building-data-science-applications)
by David Katz and Philipp Kats.

In the authors' words, the goal of this book is to teach data scientists and
engineers "not only how to implement Python in data science projects, but also
how to maintain and design them to meet high programming standards." Needless to
say, we are considering starting a book club. Grab a copy here:


Last year in Mexico, DVC contributor Ramón Valles gave a talk about reproducible
machine learning workflows at Data Day Monterrey—and
[a video of his presentation](https://www.youtube.com/watch?v=tAxG-n20Di4) is
now online! In this Spanish-language talk, Ramón gives a thorough look at DVC,
particularly building pipelines for reproducible ML.


Finally, DVC data scientist Elle (that's me!) released a new public dataset of
posts from the Reddit forum
[r/AmItheAsshole](https://reddit.com/r/amitheasshole), and reported some
preliminary analyses. We're inviting anyone and everyone to play with the data,
make some hypotheses and share their findings. 
Check it out here: + + + +That's all for now—thanks for reading, and be in touch on our +[GitHub](https://github.com/iterative/dvc), +[Twitter](https://twitter.com/dvcorg), and +[Discord channel](https://dvc.org/chat). diff --git a/content/blogs/2020-03-24-march-20-community-gems.md b/content/blogs/2020-03-24-march-20-community-gems.md new file mode 100644 index 0000000000..377a1e5fb1 --- /dev/null +++ b/content/blogs/2020-03-24-march-20-community-gems.md @@ -0,0 +1,133 @@ +--- +title: March '20 Community Gems +date: 2020-03-12 +description: > + Great discussions and technical Q&A's from our users. +descriptionLong: > + Look here every month for great discussions and technical Q&A's from our users + and core development team. +picture: 2020-03-12/march_20_header.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/march-20-community-gems/336 +tags: + - Tags + - Community Gems + - Data registry +--- + +## Discord gems + +Here are some Q&A's from our Discord channel that we think are worth sharing. + +### Q: I have several simulations organized with Git tags. I know I can compare the metrics with `dvc metrics diff [a_rev] [b_rev]`, substituting hashes, branches, or tags for [a_rev] and [b_rev]. [But what if I wanted to see the metrics for a list of tags?](https://discordapp.com/channels/485586884165107732/563406153334128681/687634347104403528) + +DVC has a built in function for this! You can use `dvc metrics show` with the +`-T` option: + +```dvc +$ dvc metrics show -T +``` + +to list the metrics for all tagged experiments. + +Also, we have a couple of relevant discussions going on in our GitHub repo about +[handling experiments](https://github.com/iterative/dvc/issues/2799) and +[hyperparameter tuning](https://github.com/iterative/dvc/issues/3393). Feel free +to join the discussion and let us know what kind of support would help you most. 
+ +### Q: [Is there a recommended way to save metadata about the data in a `.dvc` file?](https://discordapp.com/channels/485586884165107732/563406153334128681/685105104340386037) In particular, I'd like to save summary statistics (e.g., mean, minimum, and maximum) about my data. + +One simple way to keep metadata in a `.dvc` file is by using the `meta` field. +Each `meta` entry is a `key:value` pair (for example, `name: Jean-Luc`). The +`meta` field can be manually added or written programmatically, but note that if +the `.dvc` file is overwritten (perhaps by `dvc run`, `dvc add`, or +`dvc import`) these values will not be preserved. You can read more about this +[in our docs](https://dvc.org/doc/user-guide/project-structure). + +Another approach would be to track the statistics of your dataset in a metric +file, just as you might track performance metrics of a model. For a tutorial on +using DVC metrics please +[see our docs](https://dvc.org/doc/command-reference/metrics). + +### Q: My team has been using DVC in production. When we upgraded from DVC version 0.71.0, we started getting an error message: `ERROR: unexpected error - /my-folder is not a git repository`. [What's going on?](https://discordapp.com/channels/485586884165107732/485596304961962003/687403454989467650) + +This is a consequence of new support we've added for monorepos with the +`dvc init --subdir` functionality +([see more here](https://dvc.org/doc/command-reference/init#init)), which lets +there be multiple DVC projects within a single Git repository. Now, if a DVC +repository doesn't contain a `.git` directory, DVC expects the `no_scm` flag to +be present in `.dvc/config` and raises an error if not. For example, one of our +users reported this when using DVC to pull files into a Docker container that +didn't have Git initialized (for more about using DVC without Git, +[see our docs](https://dvc.org/doc/command-reference/init#initializing-dvc-without-git)). 
+
You can fix this by running `dvc config core.no_scm true` (you could include
this command in the script that creates Docker images). Alternately, you could
include `.git` in your Docker container, but this is not advisable for all
situations.

We are currently working to
[add graceful error-handling](https://github.com/iterative/dvc/issues/3474) for
this particular issue so stay tuned.

### Q: [Is there a way to force the pipeline to rerun, even if its dependencies haven't changed?](https://discordapp.com/channels/485586884165107732/563406153334128681/687422002822381609)

Yes, `dvc repro` has a flag that should help here. You can use the `-f` or
`--force` flag to reproduce the pipeline even when no changes in the
dependencies (for example, a training dataset tracked by DVC) have been found. So
if you had a hypothetical DVC pipeline whose final process was `deploy.dvc`,
you could run `dvc repro -f deploy.dvc` to rerun the whole pipeline.

### Q: What's the best way to organize DVC repositories if I have several training datasets shared by several projects? Some projects use only one dataset while others use several. [Can one project have `.dvc` files corresponding to different remotes?](https://discordapp.com/channels/485586884165107732/563406153334128681/670664813973864449)

Yes, one project directory can contain datasets from several different DVC
remotes. Specifically, DVC has functions `dvc import` and `dvc get` that emulate
the experience of using a package manager for grabbing datasets from external
sources. You can use `dvc import` or `dvc get` to access any number of datasets
that are dependencies in a given project. For more on this,
[see our tutorial on data registries](https://dvc.org/doc/use-cases/data-registries). 
+
### Q: [What are the risks of using DVC on confidential data?](https://discordapp.com/channels/485586884165107732/563406153334128681/689848196473684024)

DVC doesn't collect any information about your data (or code, or models, for
that matter). You may have noticed that DVC
[collects Anonymized Usage Analytics](https://dvc.org/doc/user-guide/analytics),
which users may
[opt out of](https://dvc.org/doc/user-guide/analytics#opting-out). The data we
collect is extremely limited and anonymized, as it is collected mainly for the
purpose of prioritizing bugs and feature development based on DVC usage. For
example, we collect info about your operating system, DVC version, and
installation method (the
[complete list of collected features is here](https://dvc.org/doc/user-guide/analytics#what)).

Many of our users work with sensitive or private data, and we've developed DVC
with such scenarios in mind from day one.

### Q: [Can you suggest a reference architecture for using DVC as part of MLOps?](https://discordapp.com/channels/485586884165107732/563406153334128681/683890642631524392)

Increasingly, DVC is being used not just to version and manage machine
learning projects, but as part of MLOps, _practices for combining data science
and software engineering_. As MLOps is a fairly new discipline, standards and
references aren't yet solidified. So while there isn't (_yet_) a standard recipe
for using DVC in MLOps projects, we can point you to a few architectures we
like, and which have been reported in sufficient detail to recreate.

First, DVC can be used to detect events (such as dataset changes) in a CI/CD
system that traditional version control systems might not be able to. An
excellent and thorough
[blog by Danilo Sato et al.](https://martinfowler.com/articles/cd4ml.html)
explores using DVC in this way, as part of a CI/CD system that retrains a model
automatically when changes in the dataset are detected. 
+ +Second, DVC can be used to support model training on cloud GPUs, particularly as +a tool for pushing and pulling files (such as datasets and trained models) +between cloud computing instances, DVC repositories, and other environments. +This architecture was the subject of a +[recent blog by Marcel Mikl and Bert Besser](https://blog.codecentric.de/en/2020/01/remote-training-gitlab-ci-dvc/). +Their report describes the cloud computing setup and continuous integration +pipeline quite well. + +If you develop your own architecture for using DVC in MLOps, please keep us +posted. We'll be eager to learn from your experience. Also, keep an eye on our +blog in the next few months. We're rolling out some new tools with a focus on +MLOps! diff --git a/content/blogs/2020-03-31-reimagining-devops-video.md b/content/blogs/2020-03-31-reimagining-devops-video.md new file mode 100644 index 0000000000..7fab1700f5 --- /dev/null +++ b/content/blogs/2020-03-31-reimagining-devops-video.md @@ -0,0 +1,50 @@ +--- +title: 'New Video! 🎥 Reimagining DevOps for Machine Learning' +date: 2020-03-31 +description: > + A talk about CI/CD with fuzzy animals. +descriptionLong: > + As machine learning matures, we need to find better ways to integrate data + science with software development. In this talk for DivOps, a conference about + the future of DevOps, DVC data scientist Elle O'Brien discusses how CI/CD can + adapt to machine learning. This is MLOps, explained with fuzzy animals. +picture: 2020-03-31/cover_image.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/new-video-reimagining-devops-for-machine-learning/341 +tags: + - CI/CD + - DevOps + - MLOps + - DivOps + - Company +--- + +Last week, DVC was part of [DivOps](https://divops.org/), a fully remote +conference led by women in DevOps. DevOps, to the newly anointed, is a +discipline bringing together strong software engineering practices with speedy +development cycles. 
As machine learning is finding its way into just about +_every_ area of research and development, we're going to need to come up with +some conventions and tools for integrating machine learning and big data with +software development. This growing field is called +[MLOps](https://towardsdatascience.com/the-rise-of-the-term-mlops-3b14d5bd1bdb). + +I gave a lightning talk about how we'll have to rethink our software development +practices in the age of machine learning. It's got a focus on +[CI/CD](https://martinfowler.com/articles/cd4ml.html), a way of structuring +workflows that we think can streamline exchanges between data scientists and +software engineers. And, it's got fuzzy animals. Check it out here: + +https://youtu.be/0MDrZpO_7Q4 + +If you liked this, you'll also want to check out the next talk in the DivOps +playlist by +[Anna Petrovicheva](https://www.linkedin.com/in/anna-petrovicheva-44b24673/), +Founder and CEO of Xperience AI. Anna's talk goes deeper into developing best +practices for software engineering with deep learning. + +https://youtu.be/8nwpCQufeE0 + +All the talks from DivOps are +[available online now](https://www.youtube.com/playlist?list=PLVeJCYrrCemgbA1cWYn3qzdgba20xJS8V), +so please check out the YouTube channel. And stay tuned on our blog for more +CI/CD discussions coming soon... diff --git a/content/blogs/2020-04-06-april-20-dvc-heartbeat.md b/content/blogs/2020-04-06-april-20-dvc-heartbeat.md new file mode 100644 index 0000000000..179a7ce59a --- /dev/null +++ b/content/blogs/2020-04-06-april-20-dvc-heartbeat.md @@ -0,0 +1,171 @@ +--- +title: April ’20 DVC❤️Heartbeat +date: 2020-04-06 +description: > + Catch up on new DVC releases, talks, and projects in our community. This + month, learn what we're up to in MLOps, CI/CD, and the intersection of data + science and software engineering. + +descriptionLong: > + Every month we share news, findings, interesting reads, community takeaways, + and everything else along the way. 
+ + Look here for updates about [DVC](https://dvc.org), our journey as a startup, + projects by our users and big ideas about best practices in ML and data + science. +picture: 2020-04-06/april_header.png +pictureComment: | + A view from + [Barrancas del Cobre](https://en.wikipedia.org/wiki/Copper_Canyon), shot by + Jorge Orpinel Pérez. Jorge has mastered the art of working on DVC remotely. +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/april-20-heartbeat/347 +tags: + - Heartbeat + - Google Drive + - MLOps + - CI/CD + - Podcast + - DivOps +--- + +Welcome to the April Heartbeat, our +[monthly roundup of cool happenings](https://dvc.org/blog/tags/heartbeat), good +reads and other bright spots in our community. + +## News + +**Adapting to the pandemic.** Although the world seems different than when we +posted last month, the DVC community is steady and strong. As a predominantly +distributed company, we've been developing our infrastructure for remote work +from the get-go. It isn't always _easy_ to schedule an all-hands meeting across +9 time zones but we make it work. This experience has prepared us well for the +COVID-19 pandemic: although there are new challenges (like caring for families +while working from home) we've been able to weather the transition to fully +remote work relatively well. + +![](../uploads/images/2020-04-06/laptop_on_boat.jpeg)_Before social distancing +started, DVC technical writer Jorge Orpinel Pérez has worked from a canoe. Check +out more photos from his workations +[on Instagram](https://www.instagram.com/workationer/)._ + +**DVC sponsors DivOps.** In a time when many conferences are going remote out of +necessity, we were fortunate to be part of an _intentionally_ remote conference +this month! We sponsored [DivOps](https://divops.org/), a fully-online meeting +led by women in DevOps. The DivOps lineup included speakers from GitHub, +DropBox, Gremlin and more. DVC data scientist Elle (that's me!) 
gave a +ten-minute talk about MLOps and CI/CD, so +[please check out the video](https://dvc.org/blog/reimagining-devops-video). +Another very relevant talk was from Anna Petrovicheva, CEO of +[Xperience AI](http://xperience.ai/); Anna +[spoke about her team's development workflow for deep learning projects](https://youtu.be/8nwpCQufeE0) +and gave a clear overivew of how they use DVC. + +**DVC on the airwaves.** In early March, Elle was interviewed on an episode of +[The Data Stream podcast](https://www.interviewquery.com/tag/podcast/) about a +DVC data science project, +[building a public dataset of posts](https://dvc.org/blog/a-public-reddit-dataset) +from the "Am I the Asshole?" subreddit. + + + +## New releases + +This month, DVC has +[released some new features](https://github.com/iterative/dvc/releases) and +updates: + +- Did you know you can use Google Drive for remote storage with DVC? We've been + hard at work delivering the best performance with Google Drive and are + thrilled to invite users to try it out. Brand new + [docs](https://dvc.org/doc/user-guide/setup-google-drive-remote#setup-a-google-drive-dvc-remote) + explain how to get started. +- We're introducing the `metrics diff` functionality, which lets you compare + metrics from different commits side-by-side + ([check out the docs](https://dvc.org/doc/command-reference/metrics/diff) to + learn more) +- Windows users, we are here for you. Contributor + [rxxg](https://github.com/rxxg) helped us get better performance on copy + operations in Windows. + +## From the community + +**DVC and R working together** One of our favorite blogs this month came from +Marcel Ribeiro-Dantas, a developer and PhD student at the +[Institut Curie](https://institut-curie.org/). Marcel wrote about using DVC to +manage projects in R, particularly defining and versioning pipelines of data +processing and analysis that can be reproduced easily. 
While DVC is language +agnostic, much of our user content has been Python-centric, so it's exciting to +see a detailed post for the R-using data scientist (for more about R with DVC, +see +[Marija Ilić's post](https://dvc.org/blog/r-code-and-reproducible-model-development-with-dvc))! + + + +Also, Marcel recently gave an interview on +[The Data Hackers Podcast](https://medium.com/data-hackers/health-data-e-o-coronav%C3%ADrus-data-hackers-podcast-22-2b059d460cb1), +a Portuguese-language show. Listen for a shout-out about DVC! + +**DVC is in another book!** Last month we reported that DVC is part of a Packt +book, +["Learn Python by Building Data Science Applications"](https://www.packtpub.com/programming/learn-python-by-building-data-science-applications). +This month, DVC got a mention in a just-released O'Reilly book, +["Building Machine Learning Pipelines"](https://www.oreilly.com/library/view/building-machine-learning/9781492053187/) +by Hannes Hapke and Catherine Nelson. + + + +**Some more links we like.** Here are a few other discussions that have caught +our attention. + +- **MLOps can be fun.** Jeroen France's blog, "MLOps: Not as boring as it + sounds!", reads like a "coming of age" story about embracing engineering as a + data scientist. It's part-motivational, part tutorial- definitely worth a + read. Here's a sample: + + > No-one wants to baby-sit, maintain, and troubleshoot their own models once + > they are in production. Every data scientist secretly hopes they can pawn + > that job off to an engineering team, or maybe an intern, right? Well, in + > fact MLOps is going to make your data science life a lot better. + +- **Leveling up your Jupyter notebooks.** In a series called + ["How to Use Jupyter Notebooks in 2020"](https://ljvmiranda921.github.io/notebook/2020/03/16/jupyter-notebooks-in-2020-part-2/), + Lj Miranda discusses how to use Jupyter Notebooks in a mature software + development workflow. 
He makes several recommendations for tools, including + DVC. + +- **Reddit discussion about CI/CD** When we shared around our DivOps conference + presentation on Reddit, some + [great discussion happened](https://www.reddit.com/r/MachineLearning/comments/fshh9p/p_a_talk_about_adapting_cicd_systems_for_ml_full/). + We chatted about how CI/CD might work for data scientists, who often begin a + project with a phase of rapid exploration, and what version control for ML + could look like without Git. + +- **Smashing the data monolith.** Engineer Juan López López wrote a blog called + ["A complete guide about how to break the data monolith"](https://medium.com/packlinkeng/a-complete-guide-about-how-to-break-the-data-monolith-caa2ab2d01f6), + which is a neat manifesto about treating infrastructure _and_ data as code. + It's got nice coverage of DVC, code examples, and some deeply enjoyable + artwork. + +![](../uploads/images/2020-04-06/monolith.jpeg)_From Juan Juan López López's +[blog](https://medium.com/packlinkeng/a-complete-guide-about-how-to-break-the-data-monolith-caa2ab2d01f6)._ + +Thanks for reading. As always, let us know what you're making with DVC and what +links are catching your interest in the blog comments, on +[Twitter](https://twitter.com/DVCorg), and our +[Discord channel](https://dvc.org/chat). Be safe and be in touch! diff --git a/content/blogs/2020-04-16-april-20-community-gems.md b/content/blogs/2020-04-16-april-20-community-gems.md new file mode 100644 index 0000000000..a7fed015be --- /dev/null +++ b/content/blogs/2020-04-16-april-20-community-gems.md @@ -0,0 +1,151 @@ +--- +title: April '20 Community Gems +date: 2020-04-16 +description: > + A roundup of technical Q&A's from the DVC community. This month, we discuss + the DVC cache, pipelines, cloud storage options and concurrency. +descriptionLong: > + A roundup of technical Q&A's from the DVC community. This month, we discuss + the DVC cache, cloud storage options and concurrency. 
+picture: 2020-04-16/DVC_Gems_April_20.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/april-20-community-gems/356 +tags: + - Community Gems + - Pipelines +--- + +## Discord gems + +Here are some Q&A's from our Discord channel that we think are worth sharing. + +### Q: [How can I view and download files that are being tracked by DVC in a repository?](https://discordapp.com/channels/485586884165107732/485596304961962003/698815826870009868) + +To list the files that are currently being tracked in a project repository by +DVC and Git, you can use `dvc list`. This will display the contents of that +repository, including `.dvc` files. To download the contents corresponding to a +particular `.dvc` file, use `dvc get`: + +Let's consider an example using both functions. Assume we're working with DVC's +data registry example repository. To list the files present, run: + +```dvc +$ dvc list -R https://github.com/iterative/dataset-registry +.gitignore +README.md +get-started/.gitignore +get-started/data.xml +get-started/data.xml.dvc +... +``` + +Note that the `-R` flag, which enables `dvc list` to display the contents of +directories inside the repository. Now assume you want to download `data.xml`, +which we can see is being tracked by DVC. To download the dataset to your local +workspace, you would then run + +```dvc +$ dvc get https://github.com/iterative/dataset-registry get-started/data.xml +``` + +For more examples and information, +[see the documents](https://dvc.org/doc/command-reference/list#list) for +`dvc list` and for [`dvc get`](https://dvc.org/doc/command-reference/get). + +### Q: [I'm setting up cloud remote storage for DVC and I'd like to forbid `dvc gc --cloud` so users can't accidently delete files in the remote. 
Will it be sufficient to restrict deletion in the remote's settings?](https://discordapp.com/channels/485586884165107732/563406153334128681/698116671298076672) + +You're right to be careful, because `dvc gc --cloud` can be dangerous in the +wrong hands- it'll remove any unused files in your remote (for more info, +[see our docs](https://dvc.org/doc/command-reference/gc#gc)). To prevent users +from having this power, setting your bucket policy to block object deletions +should do the trick. How to do this will depend on your cloud storage provider- +we found some relevant docs for +[GCP](https://cloud.google.com/iam/docs/understanding-roles#cloud_storage_roles), +[S3](https://docs.aws.amazon.com/AmazonS3/latest/dev/using-with-s3-actions.html), +and +[Azure](https://docs.microsoft.com/en-us/azure/storage/common/storage-auth-aad). +For the full list of supported remote storage types, +[see here](https://dvc.org/doc/command-reference/remote/add#supported-storage-types). + +### Q: [My team is interested in DVC, and we have all of our data in remote storage. Do we need to install a centralised enterprise version of DVC on a dedicated server? And do we have to also have a GitHub repository?](https://discordapp.com/channels/485586884165107732/563406153334128681/692524884701478992) + +There's no need for a DVC server. Our remote storage works on top of +[most kinds of cloud storage by default](https://dvc.org/doc/command-reference/remote/add#supported-storage-types), +including S3, GCP, Azure, Google Drive, and Aliyun, with no additional +infrastructure required. As for GitHub (or BitBucket, or GitLab, etc.), this is +only needed if you're interested in sharing your project with others over that +channel. We _like_ sharing projects on GitHub, but you don't have to. Any Git +repository, even a local one, will do. 
+ +So a "minimal" DVC project for you might consist of a local workspace with Git +enabled (which you _do_ need), a local Git repository, and your S3 remote +storage. Check out our +[use cases](https://dvc.org/doc/use-cases/versioning-data-and-model-files) to +see some examples of infrastructure and workflow for teams. + +### Q: [Could there be any issues with concurrent `dvc push`-es to the same remote?](https://discordapp.com/channels/485586884165107732/563406153334128681/680053750320332800) + +There are a few ways for concurrency to occur: multiple jobs running in parallel +on the same machine, or different users on different machines. But in any case, +the answer is the same: there's nothing to worry about! When pushing a file to a +DVC remote, all operations are non-destructive and atomic. + +### Q: [How do I only download part of my remote repository? For example, I only need the final output of my pipeline, not the raw data or intermediate steps.](https://discordapp.com/channels/485586884165107732/485596304961962003/696751934777852004) + +We support granular operations on DVC project repositories! Say your project's +DVC remote contains several `.dvc` files corresponding to different stages of +your pipeline: `0_process_data.dvc`, `1_split_test_train.dvc`, and +`2_train_model.dvc`. If you're only interested in the files output by the final +stage of the pipeline (`2_train_model.dvc`), you can run: + +```dvc +$ dvc pull process_data_stage.dvc +``` + +You can also use `dvc pull` at the level of individual files. This might be +needed if your DVC pipeline file creates 10 outputs, for example, and you only +want to pull one (say, `model.pkl`, your trained model) from remote DVC storage. 
+You'd simply run + +```dvc +$ dvc pull model.pkl +``` + +### Q: [How can I remove a `.dvc` file, but keep the associated files in my workspace?](https://discordapp.com/channels/485586884165107732/485596304961962003/689827778358673469) + +Sometimes, you realize you don't want to put a file under DVC tracking after +all. That's okay, easy to fix. Simply remove the `.dvc` file like any other- +`rm .dvc`. DVC will then stop tracking the file, and the associated target +file will still be in your local workspace. Note that the file will still be in +your +[DVC cache](https://dvc.org/doc/user-guide/dvc-internals#structure-of-cache-directory) +unless you clear it with `dvc gc`. + +### Q: [I'm trying to move a stage file with `dvc move`, but I'm getting an error. What's going on?](https://discordapp.com/channels/485586884165107732/563406153334128681/685125650901630996) + +The `dvc move` command is used to rename a file or directory and simultaneously +modify its corresponding DVC file. It's handy so you don't rename a file in your +local workspace that's under DVC tracking without updating DVC to the change +(see an [example here](https://dvc.org/doc/command-reference/move#description)). +The function doesn't work on +["stage files"](https://dvc.org/doc/tutorials/pipelines#define-stages) from DVC +pipelines. There's not currently an easy way to safely move `dvc.yaml` files, +and it's an +[open issue we're working on](https://github.com/iterative/dvc/issues/1489). +Until then, you can manually update `dvc.yaml`, or make a new one in the desired +location. + +### Q: [I just starting using DVC and noticed that when I `dvc push` files to remote cloud storage, the directory in my remote looks like my DVC cache, not my local workspace directory. Is this right?](https://discordapp.com/channels/485586884165107732/485596304961962003/693740598498426930) + +Yep, that's exactly how it should be! 
In order to provide deduplication and some +other optimizations, your DVC remote's directory structure will mirror the DVC +cache (which is by default in your local workspace under `.dvc/cache`). +Effectively, DVC uses your Git repository to store DVC files, which are keys for +cache files on your remote. So looking inside your remote won't be particularly +enlightening if you're looking for human-readable filenames- the file names will +look like hashes (because, well, they are). Luckily, DVC handles all the +conversions between the filenames in your local workspace and these hashes. + +To get some more intuition about this, check out some of our +[docs](https://dvc.org/doc/user-guide/dvc-internals) about how DVC organizes +files. diff --git a/content/blogs/2020-04-30-gsod-ideas-2020.md b/content/blogs/2020-04-30-gsod-ideas-2020.md new file mode 100644 index 0000000000..84ea27400a --- /dev/null +++ b/content/blogs/2020-04-30-gsod-ideas-2020.md @@ -0,0 +1,269 @@ +--- +title: Join DVC for Google Season of Docs 2020 +date: 2020-04-30 +description: > + A call for writer applications for Google Season of Docs 2020. +descriptionLong: > + DVC is looking for technical writers to take part in [Google Season of Docs + 2020](https://developers.google.com/season-of-docs) — a unique program + sponsored by Google that pairs technical writers with open source projects. In + this post we introduce our goals for the program, and specific [project + ideas](#project-ideas) for potential candidates. 
+picture: 2019-04-23/post-image.png +pictureComment: +author: jorge_orpinel +commentsUrl: https://discuss.dvc.org/t/join-dvc-for-google-season-of-docs-2020/375 +tags: + - Google Season of Docs + - Documentation + - Mentoring + - Company +--- + +After a successful experience with the first edition of **Google Season of +Docs** [in 2019](/blog/dvc-project-ideas-for-google-summer-of-docs-2019), we're +putting out a call for writers to apply to work with DVC as part of the +[2020 edition](https://developers.google.com/season-of-docs). If you want to +write open source software documentation with mentorship from our team, read on. + +**TLDR**: Skip to [project ideas](#project-ideas). + +[DVC](https://dvc.org/) has a dedicated docs team and a +[well-defined process](https://dvc.org/doc/user-guide/contributing/docs) for +creating and maintaining our documentation, modeled in part based on our past +GSoD experience. We are happy to share our experience, introduce technical +writers to the world of open source and machine learning best practices, and +work together on improving our documentation. + +## Previous experience + +In last year's Season, we matched with prolific writer +[Dashamir](https://github.com/dashohoxha), who helped us give proper structure +important part of our docs, and address key issues. 
+ +https://twitter.com/DVCorg/status/1205203662827483136 + +Some of our achievements together were: + +- Reorganized our [tutorials](https://github.com/iterative/dvc.org/pull/666) and + core [contribution guide](https://github.com/iterative/dvc.org/pull/726) +- Created [interactive lessons](https://github.com/iterative/dvc.org/issues/546) + on [Katacoda](https://www.katacoda.com/dvc) +- Docs [cleanup](https://github.com/iterative/dvc.org/pull/734) +- Suggested the creation of a + [How To](https://github.com/iterative/dvc.org/issues/563) section for our docs +- Other + [contributions](https://github.com/iterative/dvc.org/pulls?q=is%3Apr+is%3Aclosed+author%3Adashohoxha) + +Another collaborator we connected with via GSoD’19 was an amazing student +intern, [Aman](https://github.com/algomaster99). He helped us address +[dozens of tickets](https://github.com/iterative/dvc.org/pulls?q=is%3Apr+author%3Aalgomaster99+is%3Aclosed) +related to our Node.js docs web app. For example: + +- Contributed to our + [command reference](https://github.com/iterative/dvc.org/pull/315) and + [user guide](https://github.com/iterative/dvc.org/pull/366), and created a + much needed + [documentation contribution](https://github.com/iterative/dvc.org/pull/317) + guide + +- [Formatted](https://github.com/iterative/dvc.org/pull/328) the source code of + our docs and established an + [automated mechanism](https://github.com/iterative/dvc.org/pull/386) to + enforce pretty formatting going forward + +- Implemented super useful hovering tooltips based on a special + [glossary](https://github.com/iterative/dvc.org/pull/431): + + ![](../uploads/images/2020-04-30/tooltip.png) _Toolip in the `dvc remote` + command reference_ + +### Community outreach + +More positive results of the program included talks and meetups organized by our +open source contributors, with our mentorship: + +![](../uploads/images/2020-04-30/SciPy_India_Aman.png) _Our intern Aman took a +several-hour long train ride to 
+[talk](https://static.fossee.in/scipy2019/SciPyTalks/SciPyIndia2019%5FS011%5FStoring%5Fa%5Ffew%5Fversions%5Fof%5Fa%5F5GB%5Ffile%5Fin%5Fa%5Fdata%5Fscience%5Fproject%5F20191130.mp4) +at [SciPy India 2019](https://scipy.in/2019)._ + +Another star contributor who found our project via GSoD, +[Kurian](https://github.com/kurianbenoy), closed +[several tickets](https://github.com/iterative/dvc.org/issues?q=is%3Aissue+kurianbenoy), +produced a DVC intro tutorial in +[Kaggle](https://www.kaggle.com/kurianbenoy/introduction-to-data-version-control-dvc) +and +[Colab](https://colab.research.google.com/drive/1O1XmUZ8Roj1dFxWTrpE55_A7lVkWfG04), +and ended up giving a talk in +[PyCon India](https://in.pycon.org/cfp/2019/proposals/machine-learning-model-and-dataset-versioning~dRqRb/): + +https://www.youtube.com/watch?v=Ipzf6oQqQpo + +He also covered DVC for the +[Devsprints](https://kurianbenoy.github.io/2019-11-03-Devsprints%5Fexperience/) +of [MEC.conf](https://enotice.vtools.ieee.org/public/50448) + +https://twitter.com/FossMec/status/1192866498324254720 + +Yet another outstanding contributor, +[Nabanita](https://twitter.com/explorer_07), ended up organizing a DVC-themed +hackathon later that year: + +https://twitter.com/psociiit/status/1185150096792535040 + +## Prerequisites to apply + +Besides the general requirements to apply to Google Season of Docs, there are a +few skills we look for in applicants. + +1. **Clear English writing.** We strive express the concepts, processes, and + details around DVC clearly, correctly, and completely. We use general and + friendly wording as much as possible and pay close attention to consistency + in our terminology. Our team will help with copy editing. + +1. **Command line experience.** [DVC](https://dvc.org/doc) is a command line + tool that builds on top of [Git](https://git-scm.com/), so being able to play + with it and test the features will be very useful. 
Creating and managing + files, GNU/Linux commands, file and permission administration are desired + skills. + +1. **People skills.** We put a high value on communication: the ability to + discuss ideas, explain your goals, report progress, and work kindly with more + or less technical teammates. + +If you like our mission but aren't sure if you're sufficiently prepared, please +be in touch anyway. We'd love to hear from you. + +## Project ideas + +Below are several project ideas that are an immediate priority for the DVC docs +team. We welcome technical writers to create their own proposals, even if they +differ from our ideas. Most projects will be mentored primarily by our lead +technical writer, [Jorge](https://github.com/jorgeorpinel). + +1. **"How To" section.** Other than our + [use cases](https://dvc.org/doc/use-cases), we still lack a good place to + answer common questions in our docs (think FAQ). We have compiled + [set of topics](https://github.com/iterative/dvc.org/issues/899) that we + think would be best explained in a new **How To** section for this purpose. + + This project would imply relocating bits and pieces of info from existing + docs into new how-tos, as well as writing significant new material to + complete them. Expanding on our + [troubleshooting](https://dvc.org/doc/user-guide/troubleshooting) page would + probably go well as part of this project as well. + + _Difficulty rating:_ Beginner-Medium

+ +1. **DVC 1.0 docs.** We are soon to release DVC 1.0.0! This version brings some + significant changes that for the first time in our + [release history](https://github.com/iterative/dvc/releases) are not + completely backward-compatible. We expect that fully updating all our + previous docs will take a few months, and you could help us with this! The + main new features are listed below. + + > UPDATE: See [post](https://dvc.org/blog/dvc-3-years-and-1-0-release) about + > the release! And corresponding docs + > [epic](https://github.com/iterative/dvc.org/issues/1255) task + + - A + [multi-stage _pipelines file_](https://github.com/iterative/dvc/issues/1871) + that partially substitutes + [DVC files](https://dvc.org/doc/user-guide/dvc-files) + - Separation between + [scalar vs. continuous metrics](https://github.com/iterative/dvc/issues/3409), + and new commands to visualize them, such as `dvc plots` + - A new [run cache](https://github.com/iterative/dvc/issues/1234) that + automatically saves experiment checkpoints between commits + + _Difficulty rating:_ Beginner-Medium

+ +1. **Video tutorials.** Written documentation is great, but other media can also + be important for our organization to reach a wide variety of learners. + Expanding to video is also a core part of our developer advocacy strategy. + + One of DVC's priorities for this year is creating a library of video + tutorials ranging from short explanations of basic DVC functions to more + advanced use cases. You could assist in writing the scripts or even take the + lead producing some videos, so image/video editing skills would come in handy + (optional). + + ![](../uploads/images/2020-04-30/Discord_user_video_tutorials.png) _Video + tutorials are a common request by users in our [chat](https://dvc.org/chat)._ + + **Mentor**: [Elle](https://github.com/elleobrien) + + _Difficulty rating:_ Beginner-Medium

+ +1. **Interactive guides.** Many of our docs include command line examples to + illustrate how DVC works. In some cases these are full guides we want people + to be able to follow by copying commands into their terminals. This has a few + challenges: mainly keeping the rest of the document maintainable, brief, and + easy to read; and supporting people on all platforms (Mac, Windows, Linux). + + So we started extracting some of the command examples into interactive + [Katacoda scenarios](https://www.katacoda.com/dvc) to match certain docs, + however they are in need of maintenance and completion, as well as being + embedded into the corresponding pages per + [this issue](https://github.com/iterative/dvc.org/issues/670). + + This may involve working with our front-end team or, preferably, having some + Javascript coding experience. + + _Difficulty rating:_ Medium-Advanced + +1. **Javascript engine UI/UX.** Our website has custom + [source code](https://github.com/iterative/dvc.org/tree/main/src) we've + developed over the years to host our landing pages, docs, and blog all in a + high-performance, advanced static site (Node.js, Gatsby, React, Typescript). + We have several goals to further improve the usability and structure of our + site, such as: + + - Creating a + [special docs home page](https://github.com/iterative/dvc.org/issues/1073) + - Improving [mobile menus](https://github.com/iterative/dvc.org/issues/808) + - Better navigation sidebar + [highlighting](https://github.com/iterative/dvc.org/issues/753) and + [positioning](https://github.com/iterative/dvc.org/issues/1198) + - Other + [doc-engine](https://github.com/iterative/dvc.org/issues?q=is%3Aopen+is%3Aissue+label%3Adoc-engine) + and + [blog-engine](https://github.com/iterative/dvc.org/issues?q=is%3Aopen+is%3Aissue+label%3Ablog-engine) + issues + + _Difficulty rating:_ Medium-Advanced

+ +1. **SEO/ Site Analytics.** Our current website analytics are somewhat basic. We + will need to have a clear strategy to follow and improve our Search Engine + results (with meta content, media optimization, + [etc.](https://github.com/iterative/dvc.org/issues?q=is%3Aissue+is%3Aopen+seo)), + as well as to understand the behavior of our users to improve their + experience. The specifics of the project are left for the applicant to + suggest! This should be relatively simple for someone with proven experience + in SEO or website QA. + + What tools should we employ? (e.g. Google Analytics, etc.) What trends and + reports do we need to focus on? What kinds of users do we have and what + interaction flows do they each follow? Can we semi-identify these users + and/or cross-examine their data with DVC + [usage analytics](https://dvc.org/doc/user-guide/analytics)? Let's come up + with a plan to answer these and other related questions! + + _Difficulty rating:_ Beginner-Medium

+ +> For more inspiration, feel free to review our +> [epics](https://github.com/iterative/dvc.org/labels/epic) and other open docs +> [issues](https://github.com/iterative/dvc.org/issues?q=is%3Aopen+is%3Aissue+label%3Adoc-content+). + +## If you'd like to apply + +Please refer to the +[Google Season of Docs](https://developers.google.com/season-of-docs) +application guides for specifics of the program. Writers looking to know more +about DVC, and our worldwide community of contributors, will learn most by +visiting our [Discord chat](https://dvc.org/chat), +[GitHub repository](https://github.com/iterative/dvc), and +[Forum](https://discuss.dvc.org/). We are available to discuss project proposals +from interested writers and can be reached by [email](mailto:support@dvc.org) or +on Discord. diff --git a/content/blogs/2020-05-04-dvc-3-years-and-1-0-release.md b/content/blogs/2020-05-04-dvc-3-years-and-1-0-release.md new file mode 100644 index 0000000000..542771d9d8 --- /dev/null +++ b/content/blogs/2020-05-04-dvc-3-years-and-1-0-release.md @@ -0,0 +1,245 @@ +--- +title: DVC 3 Years 🎉 and 1.0 Pre-release 🚀 +date: 2020-05-04 +description: > + Today, we've got three big announcements: 🎉 3rd-year anniversary of DVC, 🚀 + DVC 1.0 pre-release is ready and ⭐ 5000 GitHub starts. + +descriptionLong: | + Today, we've got three big announcements. + + 🎉 3rd-year anniversary of DVC + + 🚀 DVC 1.0 pre-release is ready + + ⭐ DVC has reached 5K GitHub starts (coincidentally on the same day) + + We'll share what we've learned from our journey, how users helped for the new + release and how DVC is growing. +picture: 2020-05-04/owl.png +pictureComment: DVC 3rd-year anniversary +author: dmitry_petrov +commentsUrl: https://discuss.dvc.org/t/dvc-3-years-anniversary-and-1-0-pre-release/374 +tags: + - Release + - MLOps + - DataOps + - CI/CD +--- + +## 3 years anniversary! 
+ +Three years ago on **May 4th, 2017**, I published the +[first blog post about DVC](https://www.kdnuggets.com/2017/05/data-version-control-iterative-machine-learning.html). +[The first DVC discussion on Reddit](https://www.reddit.com/r/Python/comments/698ian/dvc_data_scientists_collaboration_and_iterative/). +Until that point, DVC was a private project between +[myself](https://github.com/dmpetrov) and [Ruslan](https://github.com/efiop). +Today, things look very different. + +Today, DVC gets recognized at professional conferences: people spot our logo, +and sometimes even our faces, and want to chat. There's much more content about +DVC coming from bloggers than from inside our organization. We're seeing more +and more job postings that list DVC as a requirement, and we're showing up in +[data science textbooks](https://www.amazon.com/Learn-Python-Building-Science-Applications/dp/1789535360). +When we find a new place DVC is mentioned, we celebrate in our Slack - we've +come a long way! + +The data science and ML space is fast-paced and vibrant, and we're proud that +DVC is making an impact on discussions about best practices for healthy, +sustainable ML. Every week, we chat with companies and research groups using DVC +to make their teams more productive. We're proud to be part of the growing MLOps +movement: so far, a majority of CI/CD for ML projects are implemented with DVC +under the hood. + +I can confidently say that DVC wouldn't have been possible without a lot of help +from our community. Thank you to everyone who has supported us: + +**DVC core team.** The DVC team has been the force driving our project's +evolution - we've grown from 2 to 12 full-time engineers, developers, and data +scientists. Half of the team is purely focus on DVC while the other half on +related to DVC new projects. We often get feedback about how fast our team +answers user questions - we've been told our user support is one of DVC's +"killer features". 
It's all thanks to this amazing team. + +**DVC contributors.** As of today, the DVC code base has +[126 individual contributors](https://github.com/iterative/dvc/graphs/contributors). +Many of these folks put hours into their code contribution. We're grateful for +their tenacity and generosity. + +![](../uploads/images/2020-05-04/vera-sativa.png)_Vera - 100th DVC contributor +[on GitHub](https://github.com/verasativa/)._ + +**Documentation contributors.** Another +[124 people contributed](https://github.com/iterative/dvc.org/graphs/contributors) +to the [DVC documentation](https://dvc.org/doc) and +[the website](https://dvc.org/). Every time a new person tries out DVC, they +benefit from the hard work that's gone into our docs. + +**Active community members.** Active DVC users help our team understand and +better anticipate their needs and identify priorities for development. They +share bright ideas for new features, locate and investigate bugs in code, and +welcome and support new users. + +**People who give DVC a shot.** Today, there are thousands of data scientists, +ML engineers, and developers using DVC on a regular basis. The number of users +is growing every week. Our [Discord channel](http://dvc.org/chat) has almost two +thousand users. Hundreds more connect with us through email and Twitter. To +everyone willing to try out DVC, thank you for the opportunity. + +## DVC 1.0 is the result of 3 years of learning + +All these contributions, big and small, have a collective impact on DVC's +development. I'm happy (and a bit nervous) to announce that a pre-release of a +brand new DVC 1.0 is ready for public beta testing. + +You can install the 1.0 pre-release from the master branch in our repo +(instruction [here](https://dvc.org/doc/install/pre-release)) or through pip: + +```dvc +$ pip install --upgrade --pre dvc +``` + +The new DVC is inspired by discussions and contributions from our community - +both fresh ideas and bug reports 😅. 
+ +Here are the most significant features we’re excited to be rolling out soon: + +### [Run cache](https://github.com/iterative/dvc/issues/1234) + +_Learnings:_ Forcing users to make Git commits for each ML experiment creates +too much overhead. + +DVC 1.0 has a "long memory" of DVC commands runs. This means it can identify if +a `dvc repro` has already been run and save compute time by returning the cached +result - _even if you didn't Git commit that past run_. + +We added the run-cache with CI/CD systems and other MLOps and DataOps automation +tools in mind. No more auto-commits needed after `dvc repro` in the CI/CD system +side. + +### [Multi-stage DVC files](https://github.com/iterative/dvc/issues/1871) + +_Learnings:_ ML pipelines evolve much faster than data engineering pipelines. + +We redesigned the way DVC records data processing stages with metafiles, to make +pipelines more interpretable and editable. All pipeline stages are now saved in +a single metafile, with all stages stored together instead of in separate files. + +Data hash values are no longer stored in the pipeline metafile. This improves +human-readability. + +```yaml +stages: + process: + cmd: ./process_raw_data raw_data.log users.csv + deps: + - raw_data.log + params: + - process_file + - click_threshold + outs: + - users.csv + train: + cmd: python train.py + deps: + - users.csv + params: + - epochs + - log_file + - dropout + metrics_no_cache: + - summary.json + metrics: + - logs.csv + outs: + - model.pkl +``` + +### [Plots](https://github.com/iterative/dvc/issues/3409) + +_Learnings:_ Versioning metrics and plots are no less important than data +versioning. + +Countless users asked us when we'd support metrics visualizations. Now it's +here: DVC 1.0 introduces metrics file visualization commands, `dvc metrics diff` +and `dvc plots show`. DVC plots are powered by the +[Vega-Lite](https://vega.github.io/vega-lite/) graphic library. 
+
+This function is designed not only for showing visualizations based on the
+current state of your project, but it can also combine multiple plots from your
+Git history in a single chart so you can compare results across commits. Users
+can visualize how, for example, their model accuracy in the latest commit
+differs from another commit (or even multiple commits).
+
+```dvc
+$ dvc plots diff -d logs.csv HEAD HEAD^ d1e4d848 baseline_march
+file:///Users/dmitry/src/plot/logs.csv.html
+$ open logs.csv.html
+```
+
+![](../uploads/images/2020-05-04/dvc-plots.svg)
+
+```dvc
+$ dvc plots diff -d logs.csv HEAD HEAD^ d1e4d848 baseline_march \
+    -x loss --template scatter
+file:///Users/dmitry/src/plot/logs.csv.html
+$ open logs.csv.html
+```
+
+![](../uploads/images/2020-05-04/dvc-plots-scatter.svg)
+
+### [Data transfer optimizations](https://github.com/iterative/dvc/issues/3488)
+
+_Learnings:_ In ML projects, data transfer optimization is still the king.
+
+We've done substantial work on optimizing data management commands, such as
+`dvc pull / push / status -c / gc -c`. Now, based on the amount of data, DVC can
+choose an optimal data remote traversing strategy.
+
+[Mini-indexes](https://github.com/iterative/dvc/issues/2147) were introduced to
+help DVC instantly check data directories instead of iterating over millions of
+files. This also speeds up file adding/removing to large directories.
+
+More optimizations are included in the release based on performance bottlenecks
+we profiled. A more detailed
+[benchmark report](https://gist.github.com/pmrowla/338d9645bd05df966f8aba8366cab308)
+shows how many seconds it takes to run specific commands on a directory of 2M
+images.
+
+![](../uploads/images/2020-05-04/benchmarks.svg)
+
+### [Hyperparameter tracking](https://github.com/iterative/dvc/issues/3393)
+
+_Learnings:_ ML pipeline steps depend only on a subset of the config file. 
+
+This feature was actually released in the last DVC 0.93 version (see
+[params docs](https://dvc.org/doc/command-reference/params)). However, it is an
+important step to support configuration files and ML experiments in a more
+holistic way.
+
+### For more information on the new features...
+
+Each of the big new features and improvements deserves a separate blog post. We
+will be posting more - please stay in touch.
+
+I hope our most active users will find time to check the DVC pre-release
+version and provide their feedback. The installation instructions are
+[on our website](https://dvc.org/doc/install/pre-release).
+
+## 5000 GitHub stars
+
+Activity on our GitHub page has grown organically since the DVC repo went public
+on May 4th, 2017. Coincidentally, today, on the 3rd anniversary, we have
+reached 5000 stars:
+
+![](../uploads/images/2020-05-04/5k_github.png)
+
+## Thank you!
+
+Thank you again to all DVC contributors, community members, and users. Every
+piece of your help is highly appreciated and will bring huge benefits to the
+entire ecosystem of data and ML projects.
+
+Stay healthy and safe, wherever you are in the world. And be in touch on
+[Twitter](https://twitter.com/DVCorg), and our
+[Discord channel](https://dvc.org/chat).
diff --git a/content/blogs/2020-05-08-dvc-ambassador-program-announcement.md b/content/blogs/2020-05-08-dvc-ambassador-program-announcement.md
new file mode 100644
index 0000000000..3383d65e86
--- /dev/null
+++ b/content/blogs/2020-05-08-dvc-ambassador-program-announcement.md
@@ -0,0 +1,190 @@
+---
+title: 'Join the DVC Ambassador Program!'
+date: 2020-05-08
+description: >
+  We're launching our ambassador program for people all around the world to get
+  involved in the DVC community.
+descriptionLong: >
+  We're launching our ambassador program for people all around the world to get
+  involved in the DVC community. 
Our first ambassador, Marcel Ribeiro-Dantas,
+  shares a guest blog about how ambassadors support open source projects through
+  blog writing, public outreach, and code.
+picture: 2020-05-08/Ambassador_Header.png
+author: marcel_rd
+commentsUrl: https://discuss.dvc.org/t/join-the-dvc-ambassador-program/383
+tags:
+  - Ambassador
+  - Volunteer
+  - Meetup
+  - Blogging
+  - Company
+---
+
+DVC's software can be everywhere, but its developers can’t - that’s why
+ambassadors, folks who do outreach and community building around projects they
+love, are a key part of the open source community. DVC is starting an ambassador
+program to help people who are passionate about our mission get involved.
+
+As the first DVC ambassador, and a
+[Fedora ambassador](https://fedoraproject.org/wiki/User:Mribeirodantas) before
+that, I can tell you a bit about the role. As a representative of open source
+projects, I've participated in lots of events, made friends, and traveled. Every
+single time I’ve contributed, I got this nice feeling that it was all worth it.
+I believe that if you agree with the core values of the project, a great
+relationship lies ahead :).
+
+So what are the core values of DVC, exactly? DVC is founded on the principle of
+engineering solutions for making data science and machine learning rigorous and
+reproducible. If this matters to you, too, you might be a good fit for our
+ambassador program!
+
+As an ambassador, you’ll act as a bridge between DVC and your community. There
+are lots of ways to do this, big and small. For example:
+
+- Write a blog post talking about how you use DVC in your projects
+- What about creating a network of DVC users and data scientists in your town?
+  Even though we’re self-isolating now, you can still organize online meetups.
+  [We already did one!](https://tulu.la/events/dvc-virtual-meetup-2020-00032c)
+  We help cover costs to organize meetups.
+- Do you want to talk about DVC at your office, or at a conference? 
We help + speakers develop talks, and we have some discretionary funds for travel on a + case-by-case basis. +- Want to develop a feature for DVC? We welcome contributions to the code base, + even if it’s your first pull request ever. + +Being an ambassador means getting closer to the team in charge of DVC, but at +the same time, it means going farther to reach people outside the organization- +including people who don’t know about DVC yet, people who need some help getting +started, and people who are already excited about our mission and want to find +meaningful ways to pitch in. + +## About Iterative and DVC + +DVC got started in 2017 as a personal project by Dmitry Petrov ( +[we just celebrated our 3rd birthday](https://dvc.org/blog/dvc-3-years-and-1-0-release)). +Previously, Dmitry worked at Microsoft as a data scientist and did a PhD in +Computer Science. In 2018, Dmitry teamed up with his co-founder Ivan Shcheklein +(co-founder of [The Tweeted Times](https://tweetedtimes.com/) and +[Sedna](https://www.sedna.org/) contributor) to incorporate Iterative.ai and +grow the project. Iterative.ai is building enterprise tools for collaboration on +ML projects. Currently, Iterative.ai's open source flagship project is Data +Version Control (DVC), an open source version control system for managing +complex workflows, datasets, and models. + +Development is ongoing in the core DVC project as well as new ventures into +[MLOps and Continuous Integration & Delivery (CI/CD)](https://dvc.org/blog/reimagining-devops-video) +for data science. The team is small-and-mighty, with developers, engineers, and +data scientists on four continents. The open source community is a huge part of +all Iterative.ai projects; currently, DVC has more than +[5,000 stars on GitHub](https://github.com/iterative/dvc) and more than 100 +individual contributors! + +One of DVC’s main principles is adapting existing software engineering practices +to machine learning. 
For example, DVC is built around Git version control: in an +ML project using DVC, each experiment corresponds to a Git commit. When you +check out any commit, you’ll see the source code as it was when you made the +commit- as expected. But, you’ll also see your datasets as they were and the +exact pipeline of commands you ran in that experiment! + +## Why become an ambassador? + +Like any volunteer position, the main benefit is getting to be involved in a +project you believe in. But there are some perks: + +- Establishing a formal relationship with DVC that can go on your CV/resume. + We'll boost your content on our social channels, too. +- Access to support from the DVC team, such as financial resources to organize + your own meetup for local data scientists and ML enthusiasts +- Mentorship about crafting blogs and talks, if desired. DVC team members + regularly help people in the community develop their presentations and blogs + for accuracy and clarity +- Closer relationships with the DVC team means more chances to participate in + conversations that guide our product decisions. + +For students and early career professionals, you can learn a lot by interacting +with us! While you can certainly write a blog post or organize a meetup without +being an ambassador, the program is a way to fast-track your learning- you'll +have the creators of DVC helping you understand it well, and helping you +discover features and best practices you might not have known about. + +If you're already active in the open source or MLOps community, then becoming an +ambassador is a solid way to cement your relationship with DVC. We'd love to +recognize you for the amazing stuff you already do. 
+ +## How to become an ambassador + +If you’re interested in becoming an ambassador, send us an email at +[info@dvc.org](mailto:info@dvc.org) with the subject line “I want to be an +ambassador!” Please tell us: + +- A little about yourself and your professional background +- Any outreach work you’ve done before +- What kind of ambassador activities you’d be most interested in participating + in + +The program is structured to provide a lot of flexibility, so each ambassador +can do outreach in ways that are personally motivating and enjoyable. There are +a few guidelines: + +- We ask for at least one-year commitment +- We ask ambassadors to contribute at least four activities per year, about once + every three months. There's no upper limit to how much you can do! +- For your first contribution, we ask for a blog post- this way, we can + collaborate with you to help get all the technical details right. After that, + it’s up to you! + +## Some ideas to get started + +Our official ambassador program is just starting, but our community already has +a lot of folks making noise. Here are just a few contributions we admire- we +think they’re pretty cool inspirations for future projects. + +### Blogs and tutorials + +Shareable blogs are one of our most effective outreach strategies. They give +visibility to the author _and_ new ways to use DVC, so it's a win-win. 
+
+- [Remote training with GitLab-CI and DVC](https://blog.codecentric.de/en/2020/01/remote-training-gitlab-ci-dvc/),
+  by Marcel Mikl and Bert Besser (Bert has also organized a DVC meetup in
+  Berlin)
+- [Creating a solid Data Science development environment](https://towardsdatascience.com/creating-a-solid-data-science-development-environment-60df14ce3a34),
+  by Gabriel dos Santos Goncalves
+- [Continuous Delivery for Machine Learning](https://martinfowler.com/articles/cd4ml.html),
+  by Danilo Sato, Arif Wider, and Christoph Windheuser
+- [Manage your Data Science Project in R](https://mribeirodantas.xyz/blog/index.php/2020/03/05/r-dvc-and-rmarkdown/)
+  was my first blog post about using DVC in an R project!
+
+### Talks
+
+Community members have presented at events like PyCon, PyData, and local
+meetups.
+
+- [Version control for data science](https://www.slideshare.net/AlessiaMarcolini/version-control-for-data-science),
+  by Alessia Marcolini @ PyCon DE & PyData Berlin
+- [How to easily set up and version control your machine learning pipelines](https://www.youtube.com/watch?v=rUTlqpcmiQw),
+  by Sarah Diot-Girard & Stephanie Bracaloni @ PyData Amsterdam
+- [ML models and dataset versioning](https://speakerdeck.com/kurianbenoy/ml-models-and-dataset-versioning),
+  by Kurian Benoy @ PyCon India
+
+### Code contributions
+
+Our GitHub repository has lots of open discussions about potential features-
+it's a goldmine for ways to pitch in. For example:
+
+- [Helge Munk Jacobsen](https://github.com/elgehelge) took on an open issue in
+  our code base about supporting hyperparameter tracking with DVC and made a
+  pull request to add this feature.
+
+- [Vera Sativa](https://github.com/verasativa/) added directory support to the
+  `dvc import-url` function- and she was our 100th contributor, so she won her
+  own DeeVee the owl. 
+ +![](../uploads/images/2020-01-17/odd_with_deevee.png 'Vera and team =500')_Vera +(center, flashing a peace sign) thanked us with this lovely picture of DeeVee +and her team, [Odd Industries](https://odd.co)._ + +If any of this sounds fun to you, please be in touch over +[email](mailto:info@dvc.org) (and you can also reach us on +[Twitter](https://twitter.com/dvcorg) and our +[Discord Channel](https://discordapp.com/invite/dvwXA2N)). We look forward to +connecting with you! diff --git a/content/blogs/2020-05-14-may-20-dvc-heartbeat.md b/content/blogs/2020-05-14-may-20-dvc-heartbeat.md new file mode 100644 index 0000000000..0ce0514ca6 --- /dev/null +++ b/content/blogs/2020-05-14-may-20-dvc-heartbeat.md @@ -0,0 +1,180 @@ +--- +title: May ’20 DVC❤️Heartbeat +date: 2020-05-14 +description: > + Catch up on new DVC releases, talks, and projects in our community. This + month, learn about new features in the DVC 1.0 release, ways to get involved, + and more from the intersection of data science and software engineering. + +descriptionLong: > + Every month we share news, findings, interesting reads, community takeaways, + and everything else along the way. + + Look here for updates about [DVC](https://dvc.org), our journey as a startup, + projects by our users and big ideas about best practices in ML and data + science. +picture: 2020-05-14/May_20_Heartbeat.png +pictureComment: A big hello from DVC mascot DeeVee. +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/dvc-heartbeat-may-20/391 +tags: + - Heartbeat + - Plots + - MLOps + - Meetup + - Google Season of Docs + - Ambassador +--- + +Welcome to the May Heartbeat, our [monthly roundup of cool happenings](#news), +[new releases](#new-releases), [good reads](#from-the-community) and other +noteworthy developments the DVC community. + +## News + +**DVC turns 3.** On May 4th, we celebrated DVC's third birthday! 
Fearless leader +Dmitry Petrov +[wrote a retrospective](https://dvc.org/blog/dvc-3-years-and-1-0-release) about +how the team has grown and what we've learned from our users, contributors, and +colleagues. Thanks to everyone who celebrated with us! + +**Ambassador program launched.** DVC has just kicked off our ambassador program +with the help of our first ambassador, +[Marcel Ribeiro-Dantas](https://twitter.com/messages/40813700-894970070358564864). +Marcel is an early-stage researcher at the Institut Curie, a veteran +[ambassador of the Fedora Project](https://fedoraproject.org/wiki/User:Mribeirodantas), +and a [data science blogger](http://mribeirodantas.me/). Becoming an ambassador +is a way for folks who are passionate about contributing to the DVC community to +get recognized for their efforts. It's also a way for us to help volunteers with +financial support for meetups and travel, as well as chances to work more +closely with our team. The program is ideal for anyone who already likes +blogging about DVC, contributing code, and hosting get-togethers (virtual or +otherwise), but especially advanced students and early career data scientists +and engineers! +[Learn more about it here.](https://dvc.org/blog/dvc-ambassador-program-announcement) + +**DVC is part of 2020 Google Season of Docs.** Another way to get involved with +DVC is through +[Google Season of Docs](https://developers.google.com/season-of-docs), a program +we're participating in for the second year in a row. This program is for +technical writers to get paid experience working with the DVC team in fall 2020. +Right now, we're accepting proposals from interested writers. 
+
+[Find out more here.](https://dvc.org/blog/gsod-ideas-2020)
+
+**5000 GitHub Stars.** It finally happened- we passed 5,000 stars
+[on our GitHub repo!](https://github.com/iterative/dvc)
+
+![Animated GIF](https://media.giphy.com/media/igWE67cPgTrWwXq4Nz/giphy.gif)
+
+## New releases
+
+Coincident with DVC's 3rd birthday, we shared a pre-release of DVC 1.0. The
+release is expected in a few weeks, but you can experiment with 1.0 now (and
+make [tickets in our project repo](https://github.com/iterative/dvc) if you get
+a bug 🐛). Some major new features include:
+
+- **Run cache**, a cache of pipelines you've reproduced on your local workspace.
+  If you re-run `dvc repro` on a pipeline version that's already been executed,
+  run cache will save you compute time by returning the cached result.
+
+- **Multi-stage DVC files**. Users reported that their DVC pipelines changed a
+  lot, so we've made pipeline `.dvc` files more human-readable and editable for
+  fast redesigns.
+
+- **Plots** We've got plots powered by
+  [Vega-Lite](https://vega.github.io/vega-lite/) for making beautiful
+  visualizations comparing model performance across commits! Developer Paweł
+  Redzyński is hard at work:
+
+https://twitter.com/Paffciu1/status/1260119918525194241
+
+You can read more about the big updates coming in DVC 1.0
+[in our birthday blog](https://dvc.org/blog/dvc-3-years-and-1-0-release#dvc-10-is-the-result-of-3-years-of-learning).
+
+## From the community
+
+Developers weren't the only ones hustling this month...
+
+**First ever virtual DVC Meetup.** Marcel, our new ambassador, led an
+initiative to
+[organize a virtual meetup](https://tulu.la/events/dvc-virtual-meetup-2020-00032c)!
+Marcel shared his latest scientific work about creating a
+[new comprehensive dataset about mobility](https://www.sciencedirect.com/science/article/pii/S2352340920305928?via%3Dihub)
+during the COVID-19 pandemic and then passed off the mic to our two guest
+speakers. 
Data scientist [Elizabeth Hutton](https://github.com/ehutt) spoke how +she was building a workflow for her NLP team with DVC, and +[DAGsHub](https://dagshub.com/) co-founder +[Dean Pleban](https://twitter.com/DeanPlbn) shared his custom remote file system +setup for modeling Reddit post popularity. It was quite well-attended for our +first ever virtual hangout: we logged 40 individual logins to the meetup with +more than 30 people staying the whole time! A video of the meetup is +[on the event page](https://tulu.la/events/dvc-virtual-meetup-2020-00032c), so +you can still check out the talks and discussion we enjoyed. + +https://twitter.com/DeanPlbn/status/1258475031530790916 + +**Some blogs we like.** As usual, there's a lot of share-worthy writing in the +data science and MLOps space: + +- [Tania Allard](https://twitter.com/ixek) wrote an intensely readable, + extremely sharp guide to practical steps anyone can take to improve the + reproducibility of their ML projects. She really nails the complexity of the + workflow and the importance of decoupling code and data (which we obviously + agree with very much 😏). The graphics are also 💯- Tania is a developer + advocate to follow. + + + +- [Vimarsh Karbhari](https://medium.com/@vimarshk) blogged about how teams that + work with data can strategize better about versioning their data and analysis + pipelines. On the opposite end of giving very practical recommendations, + Vimarsh stresses a deliberate and careful approach. He emphasizes how the + team's choices should depend on factors like project maturity and how much + flexibility is going to be needed. It's a solid overview of how to begin + thinking about MLOps at a high level. + + + +- Over at [AutoRegresed](https://www.autoregressed.com/), Jack Pitts shared a + thorough tutorial about using [Pipenv](https://pypi.org/project/pipenv/), DVC + and Git together. 
As a trio, this manages dependencies and versions the + working environment, source code, dataset _and_ trained models. It's not only + a cool use case, but a very clear step-by-step explanation that should be easy + to try at home. Stay till the end for a neat trick about deploying a model as + a web service with Pipenv and DVC. + + + +## Nice tweets + +Last, here are some of our favorite tweets to read this past month: + +https://twitter.com/braaannigan/status/1257918525345234949 + + + +https://twitter.com/tcgarvin/status/1258855168436813826 + +_Thank you, thank you very much._ + +![Thank You Very Much GIF by The Wiggles](https://media.giphy.com/media/gJ2sDSKAQHUCIYUFhx/giphy.gif) + +As always, we want to hear what you're making with DVC and what you're reading. +Tell us in the blog comments, and be in touch on +[Twitter](https://twitter.com/DVCorg) and +[Discord channel](https://dvc.org/chat). Happy coding! diff --git a/content/blogs/2020-05-26-may-20-community-gems.md b/content/blogs/2020-05-26-may-20-community-gems.md new file mode 100644 index 0000000000..78d00dee74 --- /dev/null +++ b/content/blogs/2020-05-26-may-20-community-gems.md @@ -0,0 +1,132 @@ +--- +title: May '20 Community Gems +date: 2020-05-26 +description: > + A roundup of technical Q&A's from the DVC community. This month, we discuss + development best practices, sharing models and data across projects, and using + DVC with teams. +descriptionLong: > + A roundup of technical Q&A's from the DVC community. This month, we discuss + development best practices, sharing models and data across projects, and using + DVC with teams. +picture: 2020-05-26/May_20_Gems_Header.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/may-20-community-gems/398 +tags: + - Community Gems + - Cache + - Google Cloud Storage + - Import +--- + +## Discord gems + +Here are some Q&A's from our Discord channel that we think are worth sharing. 
+ +### Q: [How do I completely delete a file from DVC?](https://discord.com/channels/485586884165107732/563406153334128681/710546561498873886) + +To stop tracking a file with DVC, you can simply delete the file and its +corresponding `.dvc` file (if there is one) from your project. But, what if you +want to entirely erase a file from DVC? + +After deleting the `.dvc` file, you'll usually want to +[clear your DVC cache](https://dvc.org/doc/command-reference/gc#gc). Ordinarily, +that's done with `dvc gc`. However, if there's any chance the file you wish to +remove might be referenced by another commit (even under a different name), be +sure to use the right flag: `dvc gc --all-commits`. + +If you want to remove a single `.dvc` file without doing a cache cleanup, look +into the `.dvc` file and note the `md5` field inside. Then use this value to +identify the corresponding file in your `.dvc/cache` and delete it. For example: +if your target file has `md5`: 123456, the corresponding file in your cache will +be `.dvc/cache/12/3456`. + +There's one last case worth mentioning: what if you're deleting a file inside a +DVC-tracked folder? For example, say you've previously run + +```dvc +dvc add data_dir +``` + +and now want to remove a single file (say, `image_1.png`) from `data_dir`. When +DVC starts tracking a directory, it creates a corresponding `.dir` file inside +`.dvc/cache` that lists every file and subfolder, as well as an `md5` for each, +in a JSON format. You'll want to locate this `.dir` file in the cache, and then +find the entry corresponding to `image_1.png`. It'll give the `md5` for +`image_1.png`. Finally, go back to `.dvc/cache`, identify the file corresponding +to that `md5`, and delete it. For detailed instructions about `.dir` files, +where to find them and how they're used, +[see our docs about the structure of the cache](https://dvc.org/doc/user-guide/dvc-internals#structure-of-cache-directory). + +Having said all this... 
please know that in the future, we plan to support a
+function like `git rm` that will allow easier deletes from DVC!
+
+### Q: [Is it safe to add a custom file to my DVC remote?](https://discord.com/channels/485586884165107732/563406153334128681/707551737745244230)
+
+Definitely. Some people add additional files to their DVC remote, like a README
+to explain to teammates what the folder is being used for. Having an additional
+file in the remote that isn't part of DVC tracking won't pose any issues. You
+would only encounter problems if you were manually modifying or deleting
+contents of the remote managed by DVC.
+
+### Q: [Are there limits to how many files DVC can handle? My dataset contains ~100,000 files.](https://discord.com/channels/485586884165107732/563406153334128681/706538115048669274)
+
+We ourselves have stored datasets containing up to 2 million files, so 100,000
+is certainly feasible. Of course, the larger your dataset, the more time data
+transfer operations will take. Luckily,
+[DVC 1.0 contains several data transfer optimizations](https://dvc.org/blog/dvc-3-years-and-1-0-release#data-transfer-optimizations)
+to substantially reduce the time needed to `dvc pull / push / status -c / gc -c`
+for very large datasets.
+
+### Q: [Two developers on my team are doing `dvc push` to the same remote. Should they `dvc pull` first?](https://discord.com/channels/485586884165107732/563406153334128681/704211629075857468)
+
+It's safe to push simultaneously, no `dvc pull` needed. While some teams might
+be in the habit of frequently pulling, like in Git flow, there is less risk of
+"merge conflicts" in DVC. That's because DVC remotes store files indexed by
+`md5`s, so there's usually a very low probability of a collision (if two
+developers have two different versions of a file, they'll be stored as two
+separate files in the DVC remote- so no merge conflicts). 
+
+### Q: [What are `*.tmp` files in my DVC remote?](https://discord.com/channels/485586884165107732/563406153334128681/698163554095857745)
+
+Inside your DVC remote, you might see `.tmp` files from incomplete uploads. This
+can happen if a user killed a process like `dvc push`. You can safely remove
+them; for example, if you're using an S3 bucket, `aws s3 rm ... *.tmp` will do
+the trick.
+
+One caveat: before you delete, make sure no one is actively running `dvc push`.
+
+### Q: [I'm using a Google Cloud Platform (GCP) bucket as a DVC remote and getting an error. Any ideas?](https://discord.com/channels/485586884165107732/485596304961962003/705131622537756702)
+
+If you're getting the error,
+
+```
+ERROR: unexpected error - ('invalid_grant: Bad Request', '{\n "error": "invalid_grant",\n "error_description": "Bad Request"\n}')
+```
+
+something is going wrong with your GCP authentication! A few things to check:
+first,
+[check out our docs](https://dvc.org/doc/command-reference/remote/add#supported-storage-types)
+to `dvc remote add` a Google Cloud bucket as your remote. Note that before DVC
+can use this type of remote, you have to configure your credentials through the
+GCP CLI
+([see docs here](https://dvc.org/doc/command-reference/remote/add#supported-storage-types)).
+
+If you're still getting an error, DVC probably can't find the `.json`
+credentials file for your GCP bucket. Try authenticating using
+`gcloud beta auth application-default login`. This command obtains your access
+credentials and places them in a `.json` in your local workspace.
+
+### Q: [I'm working on several projects that all involve the same saved model. One project trains a model and pushes it to cloud storage with `dvc push`, and another takes the model out of cloud storage for use. 
What's the best practice for doing this with DVC?](https://discord.com/channels/485586884165107732/485596304961962003/708318821253120040) + +One of DVC's goals is to make it easy to move models and datasets in and out of +cloud storage. We had this in mind when we designed the function `dvc import` - +it lets you reuse artifacts from one project to another. And you can quickly +synchronize an artifact, like a model or dataset, with its latest version using +`dvc update`. Check out our +[docs about `import`](https://dvc.org/doc/command-reference/import), and also +our [data registry use case](https://dvc.org/doc/use-cases/data-registries) for +an example of sharing artifacts across projects. + +![](../uploads/images/2020-05-26/data-registry.png) _Using DVC for sharing +artifacts like datasets and models across projects and teammates._ diff --git a/content/blogs/2020-06-08-june-20-dvc-heartbeat.md b/content/blogs/2020-06-08-june-20-dvc-heartbeat.md new file mode 100644 index 0000000000..1786c5d2ee --- /dev/null +++ b/content/blogs/2020-06-08-june-20-dvc-heartbeat.md @@ -0,0 +1,214 @@ +--- +title: June ’20 Heartbeat +date: 2020-06-08 +description: > + Catch up on new DVC releases, talks, and projects in our community. This + month, learn about finishing touches on DVC 1.0, DVC in biomedical research, + recommended reading and upcoming MLOps talks. +descriptionLong: > + Catch up on new DVC releases, talks, and projects in our community. This + month, learn about finishing touches on DVC 1.0, DVC in biomedical research, + recommended reading and upcoming MLOps talks. +picture: 2020-06-08/June_20_Heartbeat_small.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/june-20-heartbeat/404 +tags: + - Heartbeat + - Udemy Course + - Pipelines + - Plots + - MLOps +--- + +Welcome to the June Heartbeat, our monthly roundup of cool happenings, +[good reads](#from-the-community) and +[up-and-coming developments](#coming-up-soon) in the DVC community. 
+
+## News
+
+In the beginning of May, we
+[pre-released DVC 1.0](https://dvc.org/blog/dvc-3-years-and-1-0-release). Ever
+since, we've been putting the final touches on 1.0- wrapping up features, fixing
+bugs 🐛, and responding to feedback from intrepid users
+[trying the pre-release](https://dvc.org/doc/install/pre-release). To recap,
+here are some of the big features coming:
+
+- **Plots powered by Vega-Lite** We're building
+  [functions for visualizing metrics](https://dvc.org/doc/command-reference/plots#plots)
+  in your project, as well as comparing metrics across commits. We chose
+  [Vega-Lite plots](https://github.com/vega/vega-lite) because they're
+  high-level, compatible with ML projects written in any language, and beautiful
+  by default.
+
+- **Human readable and writeable pipelines.** We're reworking pipelines so you
+  can modify dependencies, outputs, metrics, plots, and entire stages easily:
+  via manual edits to a `.yaml` pipeline file. This redesign will consolidate
+  pipeline `.dvc` files into a single file (yay, simpler working directory). No
+  worries for pipeline enthusiasts- DVC 1.0 is backwards compatible, so your
+  existing projects won't be interrupted.
+
+- **Run cache.** One of the most exciting features is the run-cache, a local
+  record of pipeline versions that have previously been run and the outputs of
+  those runs. It can seriously cut down on compute time if you find yourself
+  repeating pipeline executions. For our CI/CD users, it also offers a way to
+  save the output of your pipeline- like models or results-
+  [without auto-commits](https://stackoverflow.com/questions/61245284/will-you-automate-git-commit-into-ci-cd-pipline-to-save-dvc-run-experiments).
+
+DVC 1.0 work has been our top priority this past month, and we are _extremely
+close_ to the release. Think 1-2 weeks!
+
+Another neat announcement: DVC moved up on
+[ThoughtWorks Technology Radar](https://www.thoughtworks.com/radar/tools)! 
To +quote ThoughtWorks: + +> In 2018 we mentioned DVC in conjunction with the versioning data for +> reproducible analytics. Since then it has become a favorite tool for managing +> experiments in machine learning (ML) projects. Since it's based on Git, DVC is +> a familiar environment for software developers to bring their engineering +> practices to ML practice. Because it versions the code that processes data +> along with the data itself and tracks stages in a pipeline, it helps bring +> order to the modeling activities without interrupting the analysts’ flow. + +And here we are on the radar, in the Trial zone: + +![](../uploads/images/2020-06-08/radar.png) _Blip, blip, blip!_ + +We are honored. In fact, this was validating in several ways. We field a lot of +questions about our decision to build around Git, rather than creating a +platform. It's awesome to know our approach is resonating with teams at the +intersection of ML and software development. Thanks, ThoughtWorks! + +Last up in company news: you might recall that in early May, we hosted an online +meetup. [Marcel Ribeiro-Dantas](http://mribeirodantas.me) hosted guest talks +from [Elizabeth Hutton](https://github.com/ehutt) and +[Dean Pleban](https://twitter.com/DeanPlbn)- we heard about constructing a new +COVID-19 dataset, using DVC with transformer language models, and building +custom cloud infrastructure for MLOps. There's also Q&A with the DVC team, where +we fielded audience questions. A video of the meetup is available now, so check +it out if you missed the event. + +https://youtu.be/19GMtrFykSU + +## From the community + +As usual, there's a ton of noteworthy action in the DVC community. + +[Derek Haynes](https://twitter.com/dhaynes23), MLOps expert and new +[DVC Ambassador](https://dvc.org/blog/dvc-ambassador-program-announcement)- +wrote an excellent overview of using +[GitHub CodeSpaces](https://github.com/features/codespaces/). 
CodeSpaces is a +new development environment (currently in beta) that we're eagerly watching. As +Derek shows in his blog, it lets you have a Jupyter Notebook experience without +sacrificing on development standards- he uses +[whisk](https://docs.whisk-ml.org/en/latest/) to structure the project and +manage Python package dependencies, and DVC to version the model training +pipeline. + +This use case is telling in the +[battle over Jupyter notebooks](https://towardsdatascience.com/the-case-against-the-jupyter-notebook-d4da17e97243): +we might just be able to have both a notebook _and_ mature project management. +Give Derek's blog a read and tell us what you think. + + + +DVC Ambassador Marcel gave a tutorial about DVC to a bioinformatics student +group, and then an even bigger talk at the Federal University of Rio Grande de +Norte. His talk focused on how to use DVC in the context of scientific +reproducibility- specifically, large biological datasets, which are often +transformed and processed several times before ML models are fit. In my +experience, Git-flow is severely underutilized in life sciences research, so +it's exciting to see Marcel's ideas getting a big audience. + +https://twitter.com/ppgeecufrn/status/1263260554443005954 + +Also, Marcel is the first author of a new scientific paper about mobility data +across 131 countries during the COVID-19 pandemic. The preprocessing pipeline is +versioned with DVC. We don't know how Marcel gets this much done. + + + +Also just released is a scientific paper by Christoph Jansen et al. about a +framework for computational reproducibility in the life sciences that integrates +DVC. The framework is called +[Curious Containers](https://github.com/curious-containers/curious-containers)- +definitely worth checking out for biomedical researchers interested in deep +learning. 
+
+
+
+In other work of vital interest to the good of humanity, this month has seen
+some awesome applications of the
+[public Reddit dataset we released in February](https://dvc.org/blog/a-public-reddit-dataset).
+The dataset is designed for an NLP task of mighty importance: will Redditors
+vote that the poster is an asshole, or not?
+
+Daniele Gentile beat our benchmark classifier (62% accuracy, but not bad for
+logistic regression!) with Doc2Vec embeddings and a 500-neuron network. He got
+71% accuracy on held out data- nice! His blog is a fun read, and code's included
+if you want to follow along.
+
+
+
+Elsewhere on the internet, data scientist Dan Cassin delivered this beautiful
+tweet:
+
+https://twitter.com/Dan_Cassin/status/1256999648901787648
+
+Last, I want to point you to two other excellent blogs.
+[Venelin Valkov](https://github.com/curiousily) released a blog,
+[Reproducible machine learning and experiment tracking pipeline with Python and DVC](https://www.curiousily.com/posts/reproducible-machine-learning-and-experiment-tracking-pipiline-with-python-and-dvc/),
+that contains not only a detailed sample project but a livecoding video!
+
+https://youtu.be/6_kK6wRtzhk
+
+[Matthew McAteer](https://www.linkedin.com/in/matthewmcateer0/) revisited the
+famous 2015
+[Hidden Technical Debt in Machine Learning Systems](https://papers.nips.cc/paper/5656-hidden-technical-debt-in-machine-learning-systems.pdf)
+paper to ask which recommendations still work five years later. It's pretty
+great-
+[please read it](https://matthewmcateer.me/blog/machine-learning-technical-debt/).
+
+![](../uploads/images/2020-06-08/spongebob.png) _Meme by Matthew McAteer. Click
+to enlarge._
+
+## Coming up soon
+
+There are a couple of events to look forward to in the next few weeks. I'll be
+speaking at two conferences: first,
+[MLOps World](https://mlopsworld.com/program/) about CI/CD and ML. 
Next, I'm +[organizing a workshop](https://computationalaudiology.com/the-critical-role-of-computing-infrastructure-in-computational-audiology/) +at the Virtual Conference on Computational Audiology. To get ready, I'm +gathering resources about good computing practices for scientists and biomedical +research labs- +[contributions are welcome](https://github.com/andronovhopf/Lab_Computing_Resources). + +Another talk on our radar is at EuroPython 2020. Engineer +[Hongjoo Lee will be talking about building a CI/CD workflow for ML with DVC](https://ep2020.europython.eu/talks/CXG7TcM-automating-machine-learning-workflow-with-dvc/)- +we're very interested to learn about their approach. + +Lastly, [ML REPA](http://ml-repa.ru/) leader and new DVC Ambassador +[Mikhail Rozhkov](https://twitter.com/mnrozhkov) is working on a Udemy course +about DVC. Look for more updates this summer! + +Thanks for reading this month. As always, we're proud of the ways our community +works for better, more rigorous ML. diff --git a/content/blogs/2020-06-22-dvc-1-0-release.md b/content/blogs/2020-06-22-dvc-1-0-release.md new file mode 100644 index 0000000000..dc58939f72 --- /dev/null +++ b/content/blogs/2020-06-22-dvc-1-0-release.md @@ -0,0 +1,284 @@ +--- +title: 'DVC 1.0 release: new features for MLOps' +date: 2020-06-22 +description: > + Today we're releasing DVC 1.0 with new exciting features that users were + waiting for ❤️. Find all the details in this blog post. +descriptionLong: > + Today we're releasing DVC 1.0. It brings new exciting features that users were + waiting for ❤️. DVC is a more mature product now, with stable release cycles + and benchmarks. Find all the details in this blog post. 
+picture: 2020-06-22/release.png
+pictureComment: DVC 1.0 release
+author: dmitry_petrov
+commentsUrl: https://discuss.dvc.org/t/dvc-1-0-release/412
+tags:
+  - Release
+  - MLOps
+  - DataOps
+  - CI/CD
+---
+
+## Introduction
+
+3 years ago, I was concerned about good engineering standards in data science:
+data versioning, reproducibility, workflow automation — like continuous
+integration and continuous delivery (CI/CD), but for machine learning. I wanted
+there to be a "Git for data" to make all this possible. So I created DVC (Data
+Version Control), which works as version control for data projects.
+
+Technically, DVC codifies your data and machine learning pipelines as text
+metafiles (with pointers to actual data in S3/GCP/Azure/SSH), while you use Git
+for the actual versioning. DevOps folks call this approach GitOps or, more
+specifically, in this case _DataOps_ or _MLOps_.
+
+The new DVC 1.0 is inspired by discussions and contributions from our community
+of data scientists, ML engineers, developers and software engineers.
+
+## DVC 1.0
+
+The new DVC 1.0 is inspired by discussions and contributions from our community
+— both fresh ideas and bug reports 😅. All these contributions, big and small,
+have a collective impact on DVC's development. I'm confident 1.0 wouldn't be
+possible without our community. They tell us what features matter most, which
+approaches work (and which don't!), and what they need from DVC to support their
+ML projects.
+
+A few weeks ago we announced the 1.0 pre-release. After lots of helpful feedback
+from brave users, it's time to go live. Now, DVC 1.0 is available with all the
+standard installation methods including `pip`, `conda`, `brew`, `choco`, and
+system-specific packages: deb, rpm, msi, pkg. See https://dvc.org/doc/install
+for more details.
+
+## New features
+
+It took us 3 years to finalize the requirements for DVC 1.0 and stabilize the
+commands (API) and DVC file formats. 
Below are the major lessons that we have +learned in 3 years of this journey and how these are reflected in the new DVC. + +### [Multi-stage DVC files](https://github.com/iterative/dvc/issues/1871) + +Our users taught us that ML pipelines evolve much faster than data engineering +pipelines with data processing steps. People need to change the commands of the +pipeline often and it was not easy to do this with the old DVC-files. + +In DVC 1.0, the DVC metafile format was changed in three big ways. First, +instead of multiple DVC "stage files" (`*.dvc`), each project has a single +`dvc.yaml` file. By default, all stages go in this single YAML file. + +Second, we made clear connections between the `dvc run` command (a helper to +define pipeline stages), and how stages are defined in `dvc.yaml`. Many of the +options of `dvc run` are mirrored in the metafile. We wanted to make it far less +complicated to edit an existing pipeline by making `dvc.yaml` more human +readable and writable. + +Third, file and directory hash values are no longer stored in the pipeline +metafile. This approach aligns better with the GitOps paradigms and simplifies +the usage of DVC by tremendously improving metafile human-readability: + +```yaml +stages: + process: + cmd: ./process_raw_data raw_data.log users.csv + deps: + - raw_data.log + params: + - process_file + - click_threshold + outs: + - users.csv + train: + cmd: python train.py + deps: + - users.csv + params: + - epochs + - log_file + - dropout + metrics: + - logs.csv + - summary.json: + cache: false + outs: + - model.pkl +``` + +All of the hashes have been moved to a special file, `dvc.lock`, which is a lot +like the old DVC-file format. DVC uses this lock file to define which data files +need to be restored to the workspace from data remotes (cloud storage) and if a +particular pipeline stage needs to be rerun. 
In other words, we're separating +the human-readable parts of the pipeline into `dvc.yaml`, and the auto-generated +"machine" parts into `dvc.lock`. + +Another cool change: the auto-generated part (`dvc.lock`) doesn't necessarily +have to be stored in your Git repository. The new run-cache feature eliminates +the need of storing the lock file in Git repositories. That brings us to our +next big feature: + +### [Run cache](https://github.com/iterative/dvc/issues/1234) + +We built DVC with a workflow in mind: one experiment to one commit. Some users +love it, but this approach gets clunky fast for others (like folks who are +grid-searching a hyperparameter space). Making Git commits for each ML +experiment was a requirement with the old DVC, if you wanted to snapshot your +project or pipelines on each experiment. Moving forward, we want to give users +more flexibility to decide how often they want to commit. + +We had an insight that data remotes (S3, Azure Blob, SSH etc) can be used +instead of Git for storing the codified meta information, not only data. In DVC +1.0, a special structure is implemented, the run-cache, that preserves the state +(including all the hashes). Basically, all the information that is stored in the +new `dvc.lock` file is replicated in the run-cache. + +The advantage of the run-cache is that pipeline runs (and output file versions) +are not directly connected to Git commits anymore. The new DVC can store all the +runs in the run-cache, even if they were never committed to Git. + +This approach gives DVC a "long memory" of DVC stages runs. If a user tries to +run a stage that was previously run (whether committed to Git or not), then DVC +can return the result from the run-cache without rerunning it. It is a useful +feature for a hyperparameter optimization stage — when users return to the +previous sets of the parameters and don't want to wait for ML retraining. 
+
+Another benefit of the run-cache is related to CI/CD systems for ML, which is a
+holy grail of MLOps. The long memory means users don't have to make auto-commits
+in their CI/CD system side - see
+[this Stack Overflow question](https://stackoverflow.com/questions/61245284/will-you-automate-git-commit-into-ci-cd-pipline-to-save-dvc-run-experiments).
+
+### [Plots](https://github.com/iterative/dvc/issues/3409)
+
+Countless users have asked when we'd support metrics visualizations. It became
+clear that metrics and their visualization are an essential part of _DataOps_,
+especially when it comes down to navigation around ML models, datasets and
+experiments. Now it's here: DVC 1.0 introduces metrics file visualization
+commands, `dvc plots diff` and `dvc plots show`. This is brand-new functionality
+in DVC and it's _in experimental mode_ now.
+
+This function is designed not only for visualizing the current state of your
+project, but also for comparing plots across your Git history. Users can
+visualize how, for example, their model accuracy in the latest commit differs
+from another commit (or even multiple commits).
+
+```dvc
+$ dvc plots diff -d logs.csv HEAD HEAD^ d1e4d848 baseline_march
+file:///Users/dmitry/src/plot/logs.csv.html
+$ open logs.csv.html
+```
+
+![](../uploads/images/2020-05-04/dvc-plots.svg)
+
+```dvc
+$ dvc plots diff -d logs.csv HEAD HEAD^ d1e4d848 baseline_march \
+  -x loss --template scatter
+file:///Users/dmitry/src/plot/logs.csv.html
+$ open logs.csv.html
+```
+
+![](../uploads/images/2020-05-04/dvc-plots-scatter.svg)
+
+DVC plots are powered by the
+[Vega-Lite graphic library](https://vega.github.io/vega-lite/). We picked Vega
+because it's high-level to manipulate, compatible with all ML frameworks, and
+looks great out of the box. However, you don't have to know Vega to use DVC
+plots: we've provided default templates for line graphs, scatterplots, and
+confusion matrices, so you can just point DVC plots to your metrics and go. 
+
+### [Data transfer optimizations](https://github.com/iterative/dvc/issues/3488)
+
+In _DataOps_, data transfer speed is hugely important. We've done substantial
+work to optimize data management commands, like
+`dvc pull / push / status -c / gc -c`. Now, based on the amount of data to move,
+DVC can choose the optimal strategy for traversing your data remote.
+
+[Mini-indexes](https://github.com/iterative/dvc/issues/2147) help DVC instantly
+check data directories instead of iterating over millions of files. This also
+speeds up adding/removing files to/from large directories.
+
+More optimizations are included in the release based on our profiling of
+performance bottlenecks. More detailed
+[benchmark reports](https://gist.github.com/pmrowla/338d9645bd05df966f8aba8366cab308)
+show how many seconds it takes to run specific commands on a directory
+containing 2 million images.
+
+![](../uploads/images/2020-05-04/benchmarks.svg)
+
+### [Hyperparameter tracking](https://github.com/iterative/dvc/issues/3393)
+
+This feature was actually released in the last DVC 0.93 version (see the
+[params docs](https://dvc.org/doc/command-reference/params)). However, it is an
+important step to support configuration files and ML experiments in a more
+holistic way.
+
+The parameters are a special type of dependency in the pipelines. This is the
+way of telling DVC that a command depends not on a file (`params.yaml`) but on a
+particular set of values in the file:
+
+```dvc
+$ dvc run -d users.csv -o model.pkl \
+  --params lr,train.epochs,train.layers \
+  python train.py
+```
+
+The `params.yaml` file is the place where the parameters are stored:
+
+```yaml
+lr: 0.0041
+
+train:
+  epochs: 70
+  layers: 9
+
+process:
+  thresh: 0.98
+  bow: 15000
+```
+
+### Stable release cycles
+
+Today, many teams use DVC in their daily job for modeling and as part of their
+production MLOps automation systems. Stability plays an increasingly important
+role. 
+ +We've always prioritized agility and speed in our development process. There +have been weeks with two DVC releases! This approach had a ton of benefits in +terms of learning speed and rapid feedback from users. + +Now we're seeing signs that it's time to shift gears. Our API is stabilized and +version 1.0 is built with our long-term vision in mind. Our user-base has grown +and we have footing with mature teams - teams that are using DVC in +mission-critical systems. That's why we're intentionally going to spend more +time on release testing in the future. It might increase the time between +releases, but the quality of the tool will be more predictable. + +Additionally, we've already implemented a benchmark testing framework to track +performance across potential releases: https://iterative.github.io/dvc-bench/ In +this website, anyone can see the performance improvements and degradations for +every release candidate: + +![](../uploads/images/2020-06-22/dvc-benchmark.png) + +### For more information on the new features... + +Each of these new features has a story that could fill a separate blog post - so +that's what we'll be doing. We'll be posting more soon. +[Peter Rowlands](https://github.com/pmrowla) will be writing a blog post about +the performance optimization in DVC 1.0, +[Paweł Redzyński](https://github.com/pared) about versioning and visualizing +plots, [Saugat Pachhai](https://github.com/skshetry) about the new DVC file +formats and pipelines, and [Ruslan Kuprieiev](https://github.com/efiop) about +run-cache. + +Please stay in touch and subscribe to our newsletter in http://dvc.org. + +## Thank you! + +It's quite a journey to build an open source project in the ML/AI space. We're +fortunate to have a community of DVC users, contributors and cheerleaders. All +these folks tremendously help us to define, test and develop the project. 
We've +reached this significant milestone of version 1.0 together and I hope we'll +continue working on DVC and bringing the best practices of DataOps and MLOps to +the ML/AI space. + +Thank you again! And please be in touch on +[Twitter](https://twitter.com/DVCorg), and our +[Discord channel](https://dvc.org/chat). diff --git a/content/blogs/2020-06-26-scipy-2020-dvc-poster.md b/content/blogs/2020-06-26-scipy-2020-dvc-poster.md new file mode 100644 index 0000000000..b58d3492bc --- /dev/null +++ b/content/blogs/2020-06-26-scipy-2020-dvc-poster.md @@ -0,0 +1,231 @@ +--- +title: 'Packaging data and machine learning models for sharing' +date: 2020-06-26 +description: > + A virtual poster for SciPy 2020 about sharing versioned datasets and ML models + with DVC. +descriptionLong: > + A virtual poster for SciPy 2020 about sharing versioned datasets and ML models + with DVC. +picture: 2020-06-26/SciPy_2020.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/packaging-data-and-machine-learning-models-for-sharing/423 +tags: + - Import + - SciPy + - Python + - Tutorial +--- + +When I was doing my Ph.D., every time I published a paper I shared a public +GitHub repository with my dataset and scripts to reproduce my statistical +analyses. While it took a bit of work to get the repository in good shape for +sharing (cleaning up code, adding documentation), the process was +straightforward: upload everything to the repo! + +But when I started working on deep learning projects, things got considerably +more complicated. For example, in a +[data journalism project I did with The Pudding](https://pudding.cool/2019/11/big-hair/), +I wanted to understand how hair style (particularly size!) changed over the +years. 
There were a lot of moving parts:
+
+- A public dataset of yearbook photos released and maintained by
+  [Ginosar et al.](https://people.eecs.berkeley.edu/~shiry/projects/yearbooks/yearbooks.html)
+- A deep learning model I trained to segment the hair in yearbook photos
+- A derivative dataset of "hair maps" for each photo in the original dataset
+- All the code to train the deep learning model and analyse the derivative
+  dataset
+
+![](../uploads/images/2020-06-26/hairflow.png) _The parts of my big-hair-data
+project: an original public dataset, a model for segmenting the images, a
+derivative dataset of segment maps, and analysis scripts._
+
+How would you share this with a collaborator, or open it up to the public?
+Throwing it all in a GitHub repository was not an option. My model wouldn't fit
+on GitHub because it was over the 100 MB size limit. I also wanted to preserve a
+clear link between my derived dataset and the original- it should be obvious
+exactly how I got the public dataset. And if that public dataset were to ever
+change, I would ideally want it to be clear what version I used for my analyses.
+
+This blog is about several different ways of "releasing" data science projects,
+with an emphasis on preserving meaningful links about the origins of derived
+data and models. I'm not making any strong assumptions about whether project
+materials are released within an organization (only to teammates, for example)
+or to the whole internet.
+
+Let's look at a few methods.
+
+# Method One: artifacts in the cloud
+
+When you work with big models and datasets, you often can't host them in a
+GitHub repo. But you can put them in cloud storage, and then provide a script in
+your GitHub repo to download them. 
For example, in the fantastic `gpt-2-simple` +[project by Max Woolf](https://github.com/minimaxir/gpt-2-simple), Max stores +huge GPT-2 models in Google Drive and provides a script to download a specified +model to a user's local workspace if it isn't already there. + +Likewise, the [Nvidia StyleGAN release](https://github.com/NVlabs/stylegan) +provides a hardcoded URL to their model in Google Drive storage. Both the +`gpt-2-simple` and StyleGAN projects have custom scripts to handle these big +downloads, and largely thanks to the work of the project maintainers, users only +interact with the downloading process at a very high level. + +Considering some pros and cons of this approach: + +| **Pros** | **Cons** | +| :-----------------------------------: | :----------------------------: | +| It's easy to put a model in a bucket | Hardcoded links are brittle | +| Works for pip packages | Need to write custom functions | +| No extra tools, just Python scripting | Downloads aren't versioned | + +# Method Two: Hubs, Catalogs & Zoos + +There are a (growing) number of websites willing to long-term host big models +and datasets, plus relevant meta-data, code, and publications. Some even allow +you to upload several versions of a project- it's not Git, for sure, but even +basic version control is something. + +For example, [PyTorch Hub](https://pytorch.org/hub/) lets researchers publish +trained models developed in the PyTorch framework, along with code and papers. +It's easily searched and browsed, which makes projects discoverable. + +For a dataset analog, Kaggle is similar- they host user-submitted datasets and +help other users find them. Both PyTorch Hub and Kaggle have APIs for +programmatically downloading artifacts. 
+
+| **Pros**                 | **Cons**                |
+| :----------------------: | :---------------------: |
+| Browsable & discoverable | Centrally managed       |
+| Public                   | Public (no granularity) |
+| Good with big models     | Weak versioning support |
+
+# Method Three: Packaging with DVC
+
+[DVC](https://dvc.org), or Data Version Control, is a Python project for
+extending Git version control to large project artifacts like datasets and
+models. It's not a replacement for Git- DVC works _with_ Git!
+
+The basic idea is that your datasets and models are stored in a DVC repository,
+which can be any cloud storage or server of your choice. DVC creates metadata
+about file versions that can be tracked by Git and hosted on GitHub- so you can
+share your datasets and models like any GitHub project, with all the benefits of
+versioning. Let's look at a case study.
+
+## Creating a DVC project
+
+Say I have a project containing a dataset, model training code, and model.
+
+```dvc
+$ ls
+data.csv
+train.py
+model.pkl
+```
+
+Say our model and dataset are large and we want to track them with DVC. For
+remote storage, we want to use a personal S3 bucket. We would run:
+
+```dvc
+$ git init
+$ dvc init
+$ dvc remote add myremote s3://mybucket/myproject
+$ dvc add data.csv model.pkl
+$ dvc push
+```
+
+When I run these commands, I've initialized Git and DVC tracking. Next, I've set
+a DVC repository- my S3 bucket. Then I've added `data.csv` and `model.pkl` to
+DVC tracking. Finally, when I run `dvc push`, the model and dataset are pushed
+to the S3 bucket. On my local machine, two meta-files are created:
+`data.csv.dvc` and `model.pkl.dvc`. These can be tracked with Git!
+
+```dvc
+$ ls
+data.csv.dvc
+train.py
+model.pkl.dvc
+```
+
+So after setting a remote Git repository, `git add`, `commit` and `push` like
+usual (assuming you are a regular Git user, that is):
+
+```dvc
+$ git remote add origin git@github.com:elle/myproject
+$ git add . 
&& git commit -m "first commit"
+$ git push origin master
+```
+
+## Package management with DVC
+
+Now let's say one of my teammates wants to access my work so far- specifically,
+they want to see if another method for constructing features from raw data will
+help model accuracy. I've given them permission to access my GitHub repository.
+On their local machine, they'll run:
+
+```dvc
+$ dvc import https://github.com/elle/myproject data.csv model.pkl
+```
+
+This will download the latest version of the `data.csv` and `model.pkl`
+artifacts to their local machine, as well as the DVC metafiles `data.csv.dvc`
+and `model.pkl.dvc` indicating the precise version and source.
+
+Collaborators can also download artifacts from previous versions, releases, or
+parallel feature branches of a project. For example, if I released a new version
+of my project with a Git tag (say `v.2.0.1`), collaborators can run
+
+```dvc
+$ dvc get --rev v.2.0.1 \
+  https://github.com/elle/myproject data.csv
+```
+
+Lastly, because `dvc import` maintains a link between the downloaded artifacts
+and my repository, collaborators can check for project updates with
+
+```dvc
+$ dvc update data.csv model.pkl
+```
+
+If new versions are detected, DVC automatically syncs the local workspace with
+those versions.
+
+## When should you do this?
+
+In my own experience releasing a large public dataset with DVC, I've seen
+several benefits:
+
+- Within an hour, someone found data points I'd been missing. It was
+  straightforward to make a new release after patching this error.
+- Several people modeled my dataset! Highly rewarding. 
+- Since GitHub is a widely used platform for code sharing, it's a natural fit + for open source scientific projects and has little overhead for potential + collaborators + +To return to the pros and cons table: + +| **Pros** | **Cons** | +| :------------------------------------------------: | :-----------------------------------------------------: | +| Git version your dataset | No GUI access to files in DVC remote | +| Granular sharing permissions | Collaborators need to use DVC | +| DVC abstracts away download scripts/hardcoded URLs | Can be serverless, but you need to manage cloud storage | + +# The bottom line + +Packaging models and datasets is a non-trivial part of the machine learning +workflow. DVC provides a method for giving users a Git-centric experience of +cloning or forking these artifacts, with an emphasis on _versioning artifacts_ +and _abstracting away the processes of uploading, downloading, and storing +artifacts_. For projects with high complexity- like my hair project, which had +some gnarly dependencies and big artifacts- this kind of source control pays +off. If you don't know where your data came from or how it's been transformed, +it's impossible to be scientific. + +Thanks for stopping by our virtual poster! I'm happy to take questions or +comments about how version control fits into the scientific workflow. Leave a +comment, reach out on Twitter, or send an email. + +## Further reading + +_Check out our +[tutorial about creating a data registry](https://dvc.org/doc/use-cases/data-registries) +for more code examples._ diff --git a/content/blogs/2020-06-29-june-20-community-gems.md b/content/blogs/2020-06-29-june-20-community-gems.md new file mode 100644 index 0000000000..8a1d5fe52b --- /dev/null +++ b/content/blogs/2020-06-29-june-20-community-gems.md @@ -0,0 +1,163 @@ +--- +title: June '20 Community Gems +date: 2020-06-29 +description: > + A roundup of technical Q&A's from the DVC community. 
This month, we discuss + migrating to DVC 1.0, the new pipeline format, and our Python API. +descriptionLong: > + A roundup of technical Q&A's from the DVC community. This month, we discuss + migrating to DVC 1.0, the new pipeline format, and our Python API. +picture: 2020-06-29/Gems_June_20.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/june-20-community-gems/426 +tags: + - Community Gems + - MinIO + - Pipelines + - Python + - Optimization +--- + +## Highlights from Discord + +Here are some Q&A's from our Discord channel that we think are worth sharing. + +### Q: I just upgraded to DVC 1.0. I've got some pipeline stages currently saved as `.dvc` files. [Is there an easy way to convert the old `.dvc` format to the new `dvc.yaml` standard?](https://discord.com/channels/485586884165107732/563406153334128681/725019219930120232) + +Yes! You can easily transfer the stages by hand: `dvc.yaml` is designed for +manual edits in any text editor, so you can type your old stages in and then +delete the old `.dvc` files. We also have a +[migration script](https://gist.github.com/skshetry/07a3e26e6b06783e1ad7a4b6db6479da) +available, although we can't provide long-term support for it. + +Learn more about the `dvc.yaml` format in our +[brand new docs](https://dvc.org/doc/user-guide/dvc-files#dvcyaml-file)! + +![Year Opening GIF](https://media.giphy.com/media/JYpTAnhT0EI2Q/giphy.gif) + +_Just like this but with technical documentation._ + +### Q: After I pushed my local data to remote storage, I noticed the file names are different in my storage repository- they're hash values. [Can I make them more meaningful names?](https://discord.com/channels/485586884165107732/563406153334128681/717737163122540585) + +No, but for a good reason! 
What you're seeing are cached files, and they're +stored with a special naming convention that makes DVC versioning and addressing +possible- these file names are how DVC deduplicates data (to avoid keeping +multiple copies of the same file version) and ensures that each unique version +of a file is immutable. If you manually overwrote those filenames you would risk +breaking Git version control. You can +[read more about how DVC uses this file format in our docs](https://dvc.org/doc/user-guide/dvc-internals#structure-of-cache-directory). + +It sounds like you're looking for ways to interact with DVC-tracked objects at a +high level of abstraction, meaning that you want to interface with the original +filenames and not the machine-generated hashes used by DVC. There are a few +secure and recommended ways to do this: + +- If you want to see a human-readable list of files that are currently tracked + by DVC, try the `dvc list` + command-[read up on it here](https://dvc.org/doc/command-reference/list). +- Check out our + [data registry tutorial](https://dvc.org/doc/use-cases/data-registries#data-registries) + to see how the commands `dvc get` and `dvc import` are used to download and + share DVC-tracked artifacts. The syntax is built for an experience like using + a package manager. +- The [DVC Python API](https://dvc.org/doc/api-reference) gives you programmatic + access to DVC-tracked artifacts, using human-readable filenames. + +### Q: [Is it better practice to `dvc add` data files individually, or to add a directory containing multiple data files?](https://discord.com/channels/485586884165107732/563406153334128681/722141190312689675) + +If the directory you're adding is logically one unit (for example, it is the +whole dataset in your project), we recommend using `dvc add` at the directory +level. Otherwise, add files one-by-one. 
You can +[read more about how DVC versions directories in our docs](https://dvc.org/doc/user-guide/dvc-internals#structure-of-cache-directory). + +### Q: [Do you have any examples of using DVC with MinIO?](https://discord.com/channels/485586884165107732/563406153334128681/722780202844815362) + +We don't have any tutorials for this use case exactly, but it's a very +straightforward modification from +[our basic use cases](https://dvc.org/doc/use-cases). The key difference when +using MinIO or a similar S3-compatible storage (like DigitalOcean Spaces or IBM +Cloud Object Storage) is that in addition to setting remote data storage, you +must set the `endpointurl` too. For example: + +```dvc +$ dvc remote add -d myremote s3://mybucket/path/to/dir +$ dvc remote modify myremote endpointurl https://object-storage.example.com +``` + +Read up on configuring supported storage +[in our docs](https://dvc.org/doc/command-reference/remote/add#supported-storage-types). + +### Q: [If I have a folder containing many data files, is there any advantage to zipping the folder and DVC tracking the `.zip`?](https://discord.com/channels/485586884165107732/563406153334128681/714922184455225445) + +There are a few things to consider: + +- **CPU time.** Even though it can be faster to pull a single file than a + directory (though not in all cases, since we can parallelize directory + downloads), the tradeoff is the time needed to unzip your data. Depending on + your constraints, this can be expensive and undesirable. + +- **Deduplication.** DVC deduplicates on the file level. So if you add one + single file to a directory, DVC will save only that file, not the whole + dataset again. If you use a zipped directory you won't get this benefit. In + the long run, this could be more expensive in terms of storage space for your + DVC cache and remote if the contents of your dataset change frequently. + +Generally, we would recommend first trying a plain unzipped directory. 
DVC is
+designed to work with large numbers of files (on the order of millions) and
+the latest release (DVC 1.0) has
+[optimizations built for this purpose exactly](https://dvc.org/blog/dvc-1-0-release#data-transfer-optimizations).
+
+### [Q: Can I execute a `dvc push` with the DVC Python API inside a Python script?](https://discord.com/channels/485586884165107732/485596304961962003/718419219288686664)
+
+Currently, our [Python API](https://dvc.org/doc/api-reference#python-api)
+doesn't support commands like `dvc push`, `dvc pull`, or `dvc status`. It is
+designed for interfacing with objects tracked by DVC. That said, CLI commands
+are basically calling `dvc.repo.Repo` object methods. So if you want to use
+commands from within Python code, you could try creating a `Repo` object with
+`r = Repo({root_dir})` and then `r.push()`. Please note that we don't officially
+support this use case yet.
+
+Of course, you can also run DVC commands from a Python script using `sys` or a
+similar library for issuing system commands.
+
+### [Q: Does the `dvc pipeline` command for visualizing pipelines still work in DVC 1.0?](https://discord.com/channels/485586884165107732/485596304961962003/717682556203565127)
+
+Most of the `dvc pipeline` functionality- like `dvc pipeline show --ascii` to
+print out an ASCII diagram of your pipeline- has been migrated to a new command,
+`dvc dag`. This function is written for our new pipeline format. Check out
+[our new docs](https://dvc.org/doc/command-reference/dag#dag) for an example.
+
+### [Q: Is there a way to create a DVC pipeline stage without running the commands in that stage?](https://discord.com/channels/485586884165107732/485596304961962003/715271980978405447)
+
+Yes. Say you have a Python script, `train.py`, that takes in a dataset `data`
+and outputs a model `model.pkl`. 
To create a DVC pipeline stage corresponding to +this process, you could do so like this: + +```dvc +$ dvc run -n train + -d train.py -d data + -o model.pkl + python train.py +``` + +However, this would automatically rerun the command `python train.py`, which is +not necessarily desirable if you have recently run it, the process is time +consuming, and the dependencies and outputs haven't changed. You can use the +`--no-exec` flag to get around this: + +```dvc +$ dvc run --no-exec + -n train + -d train.py -d data + -o model.pkl + python train.py +``` + +This flag can also be useful when you want to define the pipeline on your local +machine but plan to run it later on a different machine (perhaps an instance in +the cloud). +[Read more about the `--no-exec` flag in our docs.](https://dvc.org/doc/command-reference/run) + +One other approach worth mentioning is that you can manually edit your +`dvc.yaml` file to add a stage. If you add a stage this way, pipeline commands +won't be executed until you run `dvc repro`. diff --git a/content/blogs/2020-07-07-cml-release.md b/content/blogs/2020-07-07-cml-release.md new file mode 100644 index 0000000000..0181166f01 --- /dev/null +++ b/content/blogs/2020-07-07-cml-release.md @@ -0,0 +1,188 @@ +--- +title: 'New Release: Continuous Machine Learning (CML) is CI/CD for ML' +date: 2020-07-07 +description: > + Today we're launching Continuous Machine Learning (CML), a new open-source + project for CI/CD with ML. Let's bring the power of DevOps to ML or MLOps. +descriptionLong: > + Today we're launching Continuous Machine Learning (CML), a new open-source + project for CI/CD with ML. Use it to automate parts of your ML workflow, + including model training and evaluation, comparing ML experiments across your + project history, and monitoring changing datasets. Let's bring the power of + DevOps to ML or MLOps. 
+picture: 2020-07-07/cover.png
+pictureComment: CML release
+author: dmitry_petrov
+commentsUrl: https://discuss.dvc.org/t/continuous-machine-learning-release/429
+tags:
+  - Release
+  - CI/CD
+  - MLOps
+  - DataOps
+---
+
+## CI/CD for machine learning
+
+Today, the DVC team is releasing a new open-source project called Continuous
+Machine Learning, or CML (https://cml.dev) to mainstream the best engineering
+practices of CI/CD to AI and ML teams. CML helps to organize MLOps
+infrastructure on top of the traditional software engineering stack instead of
+creating separate AI platforms.
+
+Continuous integration and continuous delivery (CI/CD) is a widely-used software
+engineering practice. It's a validated approach to increasing the agility of
+software development without sacrificing stability. **But why haven't CI/CD
+practices taken root in machine learning and data science so far?**
+
+We see three substantial technical barriers to using standard CI systems with
+machine learning projects:
+
+1. **Data dependencies.** In ML, data plays a similar role as code: ML results
+   critically depend on datasets, and changes in data need to trigger feedback
+   just like changes in source code. Furthermore, multi-GB datasets are
+   challenging to manage with Git-centric CI systems.
+2. **Metrics-driven.** The traditional software engineering idea of pass/fail
+   tests does not apply in ML. As an example, `+0.72% accuracy` and
+   `-0.35% precision` does not answer the question if the ML model is good or
+   not. Detailed reports with metrics and plots are needed to make a good/bad
+   model decision.
+3. **CPU/GPU resources**. ML training often requires more resources to train
+   than is typical to have in CI/CD runners. CI/CD must be connected with cloud
+   computing instances or Kubernetes clusters for ML training.
+
+## CI/CD for ML is the next step for the DVC team
+
+Since the beginning, our motivation has been helping ML teams benefit from
+DevOps. 
We started DVC because we knew that data management would be a crucial +bottleneck, and sure enough, DVC was a big step towards making pipelines and +experiments manageable and reproducible. But conversations with our community +have brought us to one conclusion again and again: CI/CD for ML is the holy +grail. + +Over the last 3 years, we've reached some big milestones: + +1. We built DVC to address the ML data management problem. Recently, we + [released DVC 1.0](https://dvc.org/blog/dvc-1-0-release), marking a new and + more stable era for our API. +2. DVC has become a core part of many ML team's daily operations. The latest + [ThoughtWorks Technology Radar](https://www.thoughtworks.com/radar/tools) + says: + + _"... it [DVC] has become a favorite tool for managing experiments in machine + learning (ML) projects. Since it's based on Git, DVC is a familiar + environment for software developers to bring their engineering practices to + ML practice."_ + +3. An extraordinary team and community have emerged around DVC: + - 15 employees in our organization https://iterative.ai + - 100+ open-source contributors to DVC https://github.com/iterative/dvc and + another 100+ open-source contributors to docs + https://github.com/iterative/dvc.org + - 2000+ community members in our Discord https://dvc.org/chat and GitHub + issue tracker https://github.com/iterative/dvc + - 4000+ regular users of DVC + +Now that DVC is maturing, we're ready to take the next step: we want to +revolutionize the ML development processes. We want ML experiments to have +greater visibility to teammates, shorter feedback loops, and more +reproducibility. We want teams to spend less time managing their computing +resources and experiments, and more time building value. The goal is to extend +the amazing results of DevOps from software development to ML and MLOps. 
+
+## _Continuous Machine Learning_ release
+
+Today, we're releasing an open-source project https://cml.dev to close the gap
+between machine learning and software development practices.
+
+CML is a library of functions used inside CI/CD runners to make ML compatible
+with **GitHub Actions** and **GitLab CI**. We've created functions to:
+
+1. Generate informative reports on every Pull/Merge Request with metrics, plots,
+   and hyperparameter changes.
+2. Provision GPU/CPU resources from cloud service providers (**AWS, GCP, Azure,
+   Ali**) and deploy CI runners using
+   [Docker Machine](https://github.com/docker/machine).
+3. Bring datasets from cloud storage to runners (using **DVC**) for model
+   training, as well as save the resulting model in cloud storage.
+
+![Auto-generated metrics-driven report in GitLab Merge Request](../uploads/images/2020-07-07/cml-report-metrics.png)
+
+The workflow and visual reports are customizable by modifying the CI
+configuration file in your GitHub `.github/workflows/*.yaml` or GitLab
+`.gitlab-ci.yml` project. Use CML functions in conjunction with your own ML
+model training and testing scripts to create your own automated workflow and
+reporting system. 
+
+```yaml
+# GitLab workflow in '.gitlab-ci.yml' file
+
+stages:
+  - cml_run
+
+cml:
+  stage: cml_run
+  image: iterativeai/cml:0-dvc2-base1
+  script:
+    - dvc pull data --run-cache
+
+    - pip install -r requirements.txt
+    - dvc repro
+
+    # Compare metrics to master
+    - git fetch --prune
+    - dvc metrics diff --show-md master >> report.md
+
+    # Visualize loss function diff
+    - dvc plots diff --target loss.csv --show-vega master > vega.json
+    - vl2png vega.json > plot.png
+    - cml publish --md plot.png >> report.md
+    - dvc push data --run-cache
+    - cml send-comment report.md
+```
+
+![Hyperparameter change with a result image in GitHub Pull request report](../uploads/images/2020-07-07/cml-report-params.png)
+
+In this example all the CML functions are defined in the **docker image** that
+is used in the workflow - `iterativeai/cml:0-dvc2-base1`. Users can specify any
+docker image. The only restriction is that the CML library needs to be installed
+to enable all the CML commands for the reporting and graphs:
+
+```bash
+npm i @dvcorg/cml
+```
+
+Examples of docker images can be found in the `docker` directory of the CML
+repository: [CML repository](https://github.com/iterative/cml).
+
+As you can see, CML is based on the assumption that MLOps can work with
+traditional engineering tools. It shouldn't require an entirely separate
+platform. We're excited about a world where DevOps practitioners can work
+fluently on both software and ML aspects of a project.
+
+## The relationship between CML and DVC
+
+CML and DVC are related projects under the umbrella of the same team, but will
+have separate websites and independent development. The CML project is hosted on
+a new web site: https://cml.dev. 
The source code and issue tracker is on GitHub: +https://github.com/iterative/cml + +For support and communications, the DVC Discord server is still the place to go: +https://dvc.org/chat We've made a new `#cml` channel there to discuss CML, CI/CD +for ML and other MLOps related questions. + +## Conclusion + +With the rise of AI/ML teams and ML platforms in addition to the software +engineering stack, we believe that the industry needs a single technology stack +to work with software as well as AI projects. A simple layer of a tool is +required to close the gap between AI projects and software projects to fit them +into the existing stack and CML is the way to make it. + +Our philosophy is that ML projects, and MLOps practices, should be built on top +of traditional engineering tools and not as a separate stack. A simple layer of +tools will be required to close the gap, and CML is part of this ecosystem. We +think this is the future of MLOps. + +As always, thanks for reading and for being part of the DVC community. We'd love +to hear what you think about CML. Please be in touch on +[Twitter](https://twitter.com/dvcorg) and [Discord](https://dvc.org/chat)! diff --git a/content/blogs/2020-07-10-july-20-dvc-heartbeat.md b/content/blogs/2020-07-10-july-20-dvc-heartbeat.md new file mode 100644 index 0000000000..791520bc45 --- /dev/null +++ b/content/blogs/2020-07-10-july-20-dvc-heartbeat.md @@ -0,0 +1,195 @@ +--- +title: July ’20 Heartbeat +date: 2020-07-10 +description: > + Catch up on new DVC releases, talks, and projects in our community. This + month, we recap the DVC 1.0 release, making the list of top 20 fastest growing + open-source startups, and interviews galore. Plus: 📣 an invitation to the + next DVC meetup! +descriptionLong: > + Catch up on new DVC releases, talks, and projects in our community. This + month, we recap the DVC 1.0 release, making the list of top 20 fastest growing + open-source startups, and interviews galore. 
Plus: 📣 an invitation to the
+  next DVC meetup!
+picture: 2020-07-10/july_20_heartbeat_header.png
+author: elle_obrien
+commentsUrl: https://discuss.dvc.org/t/july-20-dvc-heartbeat/439
+tags:
+  - Heartbeat
+  - CI/CD
+  - DVC 1.0
+  - SciPy
+  - MLOps
+  - Reproducibility
+  - Meetup
+---
+
+Welcome to the July Heartbeat, our monthly roundup of [new releases](#news),
+[talks](#community-activity), [great articles](#good-reads), and
+[upcoming events](#coming-up-soon) in the DVC community.
+
+## News
+
+### DVC 1.0 release
+
+On June 22, DVC entered a new era: the
+[official release of version 1.0](https://dvc.org/blog/dvc-1-0-release). After
+several weeks of bug-catching with our pre-release, the team has issued DVC 1.0
+for the public! Now when you
+[install DVC through your package manager of choice](https://dvc.org/doc/install),
+you'll get the latest version. Welcome to the future.
+
+To recap, DVC 1.0 has some big new features like:
+
+- Plots powered by Vega-Lite so you can compare metrics across commits
+- New and easier pipeline configuration files- edit your DVC pipeline like a
+  text file!
+- Optimizations for data transfer speed
+
+Read all the [release notes](https://dvc.org/blog/dvc-1-0-release) for more, and
+stop by our [Discord](https://discordapp.com/invite/dvwXA2N) if you need support
+migrating (don't worry, 1.0 is backwards compatible).
+
+### Virtual meetup!
+
+In May, we had our [first ever virtual meetup](/blog/may-20-dvc-heartbeat). We
+had amazing talks from [Dean Pleban](https://twitter.com/DeanPlbn) and
+[Elizabeth Hutton](https://github.com/ehutt), plus time for Q&A with the DVC
+team- you can
+[watch the recording](https://www.youtube.com/watch?v=19GMtrFykSU&list=PLVeJCYrrCemiOc1SS_PIB3Tb3HX0Aqw3j)
+if you missed it!
+
+On Thursday, July 30, we're hosting our second meetup! Ambassador
+[Marcel Ribeiro-Dantas](http://mribeirodantas.me/) is hosting once again. 
We'll +have short talks about causal modeling and CI/CD, plus lots of time for chatting +and catching up. Please RSVP! + +

July DVC Meetup: Data Science & DevOps!

This meetup will be hosted by DVC Ambassador Marcel! AGENDA:We have two 10-minute talks on the agenda:- Causal Modeling with DVC - Marcel- Continuous integration for ML case studies - Elle Following talks, we'll have Q&A with the DVC team and time for community discussion.

+ + +### DVC is in the top 20 fastest-growing open source startups + +Konstantin Vinogradov at [Runa Capital](https://runacap.com/) used the GitHub +API to +[identify the fastest growing public repositories on GitHub](https://medium.com/runacapital/open-source-growth-benchmarks-and-the-20-fastest-growing-oss-startups-d3556a669fe6) +in terms of stars and forks. He used these metrics to estimate the top 20 +fastest growing startups in open source software. And guess what, DVC made the +cut! We're in great company. + +![](../uploads/images/2020-07-10/top20startups.png) + +### New team member + +We have a new teammate-[Maxim Shmakov](https://www.linkedin.com/in/mvshmakov/), +previously of Yandex, is joining us! Maxim is a front-end engineer joining us +from Moscow. Please welcome him to DVC. 👋 + +## Community activity + +We've been busy! Although we are mostly homebound these days, there has been no +shortage of speaking engagements. Here's a recap. + +### Meetings and talks + +- Co-founders Dmitry and Ivan appeared on the HasGeek TV series + [Making Data Science Work](https://hasgeek.com/fifthelephant/making-data-science-work-session-3/) + to discuss engineering for data science with hosts + [Venkata Pingali](https://www.linkedin.com/in/pingali/) and + [Indrayudh Ghoshal](https://www.linkedin.com/in/indrayudhghoshal/). The + livestream is available for viewing on YouTube! + +https://www.youtube.com/watch?v=EWcpALbzZRg + +- Dmitry appeared on the [MLOps.community](https://mlops.community/) meetup to + chat with host [Demetrios Brinkmann](https://www.linkedin.com/in/dpbrinkm/). + They talked about the open source ecosystem, the difference between tools and + platforms, and what it means to codify data. + +https://www.youtube.com/watch?v=ojV1tK9jXH8&t=2295s + +- I (Elle) gave a talk at the + [MLOps Production & Engineering World](https://mlopsworld.com/) meeting, + called "Adapting continuous integration and continuous delivery for ML". 
I + shared an approach to using GitHub Actions with ML projects. Video coming + soon! + +https://twitter.com/TMLS_TO/status/1273693487104503808 + +- Extremely early the next morning, clinician-scientist + [Cris Lanting](https://www.linkedin.com/in/crislanting/?originalSubdomain=nl) + and I co-led a workshop about developing strong computational infrastructure + and practices in research as part of the + [Virtual Conference on Computational Audiology](https://computationalaudiology.com/). + We talked about big ideas for making scientific research reproducible, + manageable, and shareable. For the curious, the workshop is still viewable! + +https://www.youtube.com/watch?v=W4CoptalWw0 + +- DVC has a virtual poster at [SciPy 2020](https://www.scipy2020.scipy.org/)! We + prepared a demo about + [packaging models and datasets like software](https://dvc.org/blog/scipy-2020-dvc-poster) + so they can be widely disseminated via GitHub. + +### Good reads + +Some excellent reading recommendations from the community: + +- Data scientist Déborah Mesquita published a thorough guide to using new DVC + 1.0 pipelines in a sample ML project. It's truly complete, covering data + collection to model evaluation, with detailed code examples. If you are new to + pipelines, do not miss this! + + + +- Caleb Kaiser of [Cortex](https://github.com/cortexlabs/cortex) (another + startup in the Runa Capital's Top 20 list!) shared a thinkpiece about + challenges from software engineering that can inform production ML. We really + agree with what he has to say about reproducibility: + +> You typically hear about “reproducibility” in reference to ML research, +> particularly when a paper doesn’t include enough information to recreate the +> experiment. However, reproducibility also comes up a lot in production ML. +> Think of it this way — you’re on a team staffed with data scientists and +> engineers, and you’re all responsible for an image classification API. 
The +> data scientists are constantly trying new techniques and architectural tweaks +> to improve the model’s baseline performance, while at the same time, the model +> is constantly being retrained on new data. Looking over the APIs performance, +> you see one moment a week ago where the model’s performance dropped +> significantly. What caused that drop? Without knowing exactly how the model +> was trained, and on what data, it’s impossible to know for sure. + + + +- Mukul Sood wrote about the Real World, a place beyond Jupyter notebooks where + data is non-stationary and servers are unreliable! He covers some very real + challenges for taking a data science project into production and introduces + the need for CI/CD practices in healthy, scalable ML applications. + + + +### A nice tweet + +We'll close on a nice tweet from [Russell Jurney](https://datasyndrome.com/): + +https://twitter.com/rjurney/status/1266735603921547264 + +Thanks, we couldn't do it without our community! As always, thanks for joining +us and reading. There are lots of ways to stay in touch and we always love to +hear from you. Follow us on [Twitter](twitter.com/dvcorg), join our +[Discord server](https://discordapp.com/invite/dvwXA2N), or leave a blog +comment. Until next time! 😎 diff --git a/content/blogs/2020-07-16-devops-for-data-scientists.md b/content/blogs/2020-07-16-devops-for-data-scientists.md new file mode 100644 index 0000000000..1ff8173ebe --- /dev/null +++ b/content/blogs/2020-07-16-devops-for-data-scientists.md @@ -0,0 +1,379 @@ +--- +title: What data scientists need to know about DevOps +date: 2020-07-16 +description: > + A philosophical and practical guide to using continuous integration (via + GitHub Actions) to build an automatic model training system. +picture: 2020-07-16/unicorn_floatie.jpg +pictureComment: | + The unicorn! A mythical data scientist who can code, write unit tests + AND resist the lure of a deep neural network when logistic regression + will do. 
+ + Photo by [James Lee](https://unsplash.com/@picsbyjameslee) via + [Unsplash](https://unsplash.com/photos/qSf_4bNsoWc). +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/what-data-scientists-need-to-know-about-devops/447 +tags: + - GitHub Actions + - MLOps + - CI/CD + - Cloud training + - CML + - Company + - Tutorial +--- + +With the rapid evolution of machine learning (ML) in the last few years, it’s +become +[trivially easy to begin ML experiments](https://towardsdatascience.com/deep-learning-isnt-hard-anymore-26db0d4749d7). +Thanks to libraries like [scikit-learn](https://scikit-learn.org/stable/) and +[Keras](https://github.com/keras-team/keras), you can make models with a few +lines of code. + +But it’s harder than ever to turn data science projects into meaningful +applications, like a model that informs team decisions or becomes part of a +product. The typical ML project involves +[so many distinct skill sets](https://ieeexplore.ieee.org/abstract/document/8804457) +that it’s challenging, if not outright impossible, for any one person to master +them all — so hard, the rare data scientist who can also develop quality +software and play engineer is called a unicorn! + +As the field matures, a lot of jobs are going to require a mix of software, +engineering, and mathematical chops. Some say +[they](https://www.anaconda.com/state-of-data-science-2020?utm_medium=press&utm_source=anaconda&utm_campaign=sods-2020&utm_content=report) +[already](http://veekaybee.github.io/2019/02/13/data-science-is-different/) +[do](https://tech.trivago.com/2018/12/03/teardown-rebuild-migrating-from-hive-to-pyspark/). 
+ +To quote the unparalleled data scientist/engineer/critical observer Vicki Boykis +in her blog +[Data science is different now](http://veekaybee.github.io/2019/02/13/data-science-is-different/): + +> What is becoming clear is that, in the late stage of the hype cycle, data +> science is asymptotically moving closer to engineering, and the +> [skills that data scientists need](https://www.youtube.com/watch?v=frQeK8xo9Ls) +> moving forward are less visualization and statistics-based, and +> [more in line with traditional computer science curricula](https://tech.trivago.com/2018/12/03/teardown-rebuild-migrating-from-hive-to-pyspark/). + +## Why data scientists need to know about DevOps + +So which of the many, many engineering and software skills should data +scientists learn? My money is on DevOps. DevOps, a portmanteau of development +and operations, was officially born in 2009 +[at a Belgian conference](https://en.wikipedia.org/wiki/DevOps#History). The +meeting was convened as a response to tensions between two facets of tech +organizations that historically experienced deep divisions. Software developers +needed to move fast and experiment often, while Operations teams prioritized +stability and availability of services (these are the people who keep servers +running day in and day out). Their goals were not only opposing, they were +competing. + +That sounds awfully reminiscent of today’s data science. Data scientists create +value by experiments: new ways of modeling, combining, and transforming data. +Meanwhile, the organizations that employ data scientists are incentivized for +stability. + +The consequences of this division are profound: in the +[latest Anaconda “State of Data Science” report](https://www.globenewswire.com/news-release/2020/06/30/2055578/0/en/Anaconda-Releases-2020-State-of-Data-Science-Survey-Results.html), +“fewer than half (48%) of respondents feel they can demonstrate the impact of +data science” on their organization. 
By some estimates, the vast majority of +[models created by data scientists end up stuck on a shelf](https://venturebeat.com/2019/07/19/why-do-87-of-data-science-projects-never-make-it-into-production/). +We don’t yet have strong practices for passing models between the teams that +create them and the teams that deploy them. Data scientists and the developers +and engineers who implement their work have entirely different tools, +constraints, and skill sets. + +DevOps emerged to combat this sort of deadlock in software, back when it was +developers vs. operations. And it was tremendously successful: +[many](http://engineering.microsoft.com/devops/) +[teams](https://insights.sei.cmu.edu/devops/2015/02/devops-case-study-amazon-aws.html) +have gone from deploying new code every few months to several times a day. Now +that we have machine learning vs. operations, it’s time to think about MLOps — +principles from DevOps that work for data science. + +## Introducing Continuous Integration + +DevOps is both a philosophy and a set of practices, including: + +1. Automate everything you can + +2. Get feedback on new ideas fast + +3. Reduce manual handoffs in your workflow + +In a typical data science project, we can see some applications: + +1. **Automate everything you can.** Automate parts of your data processing, + model training, and model testing that are repetitive and predictable. + +2. **Get feedback on new ideas fast.** When your data, code, or software + environment changes, test it immediately in a production-like environment + (meaning, a machine with the dependencies and constraints you anticipate + having in production). + +3. **Reduce manual handoffs in your workflow.** Find opportunities for data + scientists to test their own models as much as possible. Don’t wait until a + developer is available to see how the model will behave in a production-like + environment. 
+ +The standard DevOps approach for accomplishing these goals is a method called +continuous integration (CI). + +The gist is that when you change a project’s source code (usually, changes are +registered via git commits), your software is automatically built and tested. +Every action triggers feedback. CI is often used with +[Git-flow](https://nvie.com/posts/a-successful-git-branching-model/), a +development architecture in which new features are built on Git branches (need a +Git refresher? +[Try this](https://towardsdatascience.com/why-git-and-how-to-use-git-as-a-data-scientist-4fa2d3bdc197)). +When a feature branch passes the automated tests, it becomes a candidate to be +merged into the master branch. + +![](../uploads/images/2020-07-16/basic_ci_system.png) _Here's what continuous +integration looks like in software development._ + +With this setup, we have automation — code changes trigger an automatic build +followed by testing. We have fast feedback, because we get test results back +quickly, so the developer can keep iterating on their code. And because all this +happens automatically, you don’t need to wait for anyone else to get feedback — +one less handoff! + +_So why don’t we use continuous integration already in ML?_ Some reasons are +cultural, like a low crossover between data science and software engineering +communities. Others are technical- for example, to understand your model’s +performance, you need to look at metrics like accuracy, specificity, and +sensitivity. You might be assisted by data visualizations, like a confusion +matrix or loss plot. So pass/fail tests won’t cut it for feedback. Understanding +if a model is improved requires some domain knowledge about the problem at hand, +so test results need to be reported in an efficient and human-interpretable way. + +![](../uploads/images/2020-07-16/ci_for_data_system.png) _Here's what continuous +integration might look like in a machine learning project. 
Inspected by Data
+Science Doggy._
+
+## How do CI systems work?
+
+Now we’ll get even more practical. Let’s take a look at how a typical CI system
+works. Luckily for learners, the barrier has never been lower thanks to tools
+like GitHub Actions and GitLab CI- they have clear graphical interfaces and
+excellent docs geared for first-time users. Since GitHub Actions is completely
+free for public projects, we’ll use it for this example. It works like this:
+
+1. You create a GitHub repository. You create a directory called
+   `.github/workflows`, and inside, you place a special `.yaml` file with a
+   script you want to run- like,
+
+```dvc
+$ python train.py
+```
+
+2. You change the files in your project repository somehow and Git commit the
+   change. Then, push to your GitHub repository.
+
+```dvc
+# Create a new git branch for experimenting
+$ git checkout -b "experiment"
+$ edit train.py
+
+# git add, commit, and push your changes
+$ git add . && git commit -m "Normalized features"
+$ git push origin experiment
+```
+
+3. As soon as GitHub detects the push, GitHub deploys one of their computers to
+   run the functions in your `.yaml`.
+
+4. GitHub returns a notification if the functions ran successfully or not.
+
+![](../uploads/images/2020-07-16/run_notification.png) _Find this in the Actions
+tab of your GitHub repository._
+
+That’s it! What’s really neat here is that you’re using GitHub’s computers to
+run your code. All you have to do is update your code and push the change to
+your repository, and the workflow happens automatically.
+
+Back to that special `.yaml` file I mentioned in Step 1- let’s take a quick look
+at one. It can have any name you like, as long as the file extension is `.yaml`
+and it’s stored in the directory `.github/workflows`. 
Here’s one:
+
+```yaml
+# .github/workflows/ci.yaml
+name: train-my-model
+on: [push]
+jobs:
+  run:
+    runs-on: [ubuntu-latest]
+    steps:
+      - uses: actions/checkout@v2
+      - name: training
+        run: |
+          pip install -r requirements.txt
+          python train.py
+```
+
+There’s a lot going on, but most of it is the same from Action to Action- you
+can pretty much copy and paste this standard GitHub Actions template, but fill
+in your workflow in the `run` field.
+
+If this file is in your project repo, whenever GitHub detects a change to your
+code (registered via a push), GitHub Actions will deploy an Ubuntu runner and
+attempt to execute your commands to install requirements and run a Python
+script. Be aware that you have to have the files required for your workflow —
+here, `requirements.txt` and `train.py` — in your project repo!
+
+## Get better feedback
+
+As we alluded to earlier, automatic training is pretty cool and all, but it’s
+important to have the results in a format that’s easy to understand. Currently,
+GitHub Actions gives you access to the runner’s logs, which are plain text.
+
+![](../uploads/images/2020-07-16/github_actions_log.png) _An example printout
+from a GitHub Actions log._
+
+But understanding your model’s performance is tricky. Models and data are high
+dimensional and often behave nonlinearly — two things that are especially hard
+to understand without pictures!
+
+I can show you one approach for putting data viz in the CI loop. For the last
+few months, my team at Iterative.ai has been working on a toolkit to help use
+GitHub Actions and GitLab CI for machine learning projects. It’s called
+[Continuous Machine Learning](https://cml.dev) (CML for short), and it’s open
+source and free.
+
+Working from the basic idea of “Let’s use GitHub Actions to train ML models,”
+we’ve built some functions to give more detailed reports than a pass/fail
+notification. 
CML helps you put images and tables in the reports, like this +confusion matrix generated by SciKit-learn: + +![](../uploads/images/2020-07-16/cml_basic_report.png) _This report appears when +you make a Pull Request in GitHub!_ + +To make this report, our GitHub Action executed a Python model training script, +and then used CML functions to write our model accuracy and confusion matrix to +a markdown document. Then CML passed the markdown document to GitHub. + +Our revised `.yaml` file contains the following workflow: + +```yaml +name: train-my-model +on: [push] +jobs: + run: + runs-on: [ubuntu-latest] + container: iterativeai/cml:0-dvc2-base1 + steps: + - uses: actions/checkout@v2 + - name: training + env: + repo_token: ${{ secrets.GITHUB_TOKEN }} + run: | + # train.py outputs metrics.txt and plot.png + pip3 install -r requirements.txt + python train.py + + # copy the contents of metrics.txt to our markdown report + cat metrics.txt >> report.md + + # add our confusion matrix to report.md + cml publish plot.png --md >> report.md + + # send the report to GitHub for display + cml send-comment report.md +``` + +You can see the entire +[project repository here](https://github.com/iterative/cml_base_case). Note that +our .yaml now contains a few more configuration details, like a special Docker +container and an environmental variable, plus some new code to run. The +container and environmental variable details are standard in every CML project, +not something the user needs to manipulate, so focus on the code! + +With the addition of these CML functions to the workflow, we’ve created a more +complete feedback loop in our CI system: + +1. Make a Git branch and change your code on that branch. + +2. Automatically train model and produce metrics (accuracy) and a visualization + (confusion matrix). + +3. Embed those results in a visual report in your Pull Request. 
+
+Now, when you and your teammates are deciding if your changes have a positive
+effect on your modeling goals, you have a dashboard of sorts to review. Plus,
+this report is linked by Git to your exact project version (data and code) AND
+the runner used for training AND the logs from that run. Very thorough! No more
+graphs floating around your workspace that have long ago lost any connection to
+your code!
+
+So that’s the basic idea of CI in a data science project. To be clear, this
+example is among the simplest ways to work with CI. In real life, you’ll likely
+encounter considerably more complex scenarios. CML also has features to help you
+use large datasets stored outside your GitHub repository (using DVC) and train
+on cloud instances, instead of the default GitHub Actions runners. That means
+you can use GPUs and other specialized setups.
+
+For example, I made a project using GitHub Actions to deploy an
+[EC2 GPU and then train a neural style transfer model](https://github.com/iterative/cml_cloud_case).
+Here’s my CML report:
+
+![](../uploads/images/2020-07-16/cloud_report.png) _Training in the cloud!
+Weeeeeee!_
+
+You can also use your own Docker containers, so you can closely emulate the
+environment of a model in production. I’ll be blogging more about these advanced
+use cases in the future.
+
+## Final thoughts on CI for ML
+
+To summarize what we’ve said so far:
+
+**DevOps is not a specific technology, but a philosophy and a set of principles
+and practices for fundamentally restructuring the process of creating
+software.** It’s effective because it **addresses systemic bottlenecks** in how
+teams work and experiment with new code.
+
+As data science matures in the coming years, people who understand how to apply
+DevOps principles to their machine learning projects will be a valuable
+commodity — both in terms of salary and their organizational impact.
Continuous
+integration is a staple of DevOps and one of the most effective known methods
+for building a culture with reliable automation, fast testing, and autonomy for
+teams.
+
+CI can be implemented with systems like
+[GitHub Actions](https://github.com/features/actions) or
+[GitLab CI](https://about.gitlab.com/stages-devops-lifecycle/continuous-integration/),
+and you can use these services to build automatic model training systems. The
+benefits are numerous:
+
+1. Your code, data, models, and training infrastructure (hardware and software
+   environment) are Git versioned.
+
+2. You’re automating work, testing frequently and getting fast feedback (with
+   visual reports if you use CML). In the long run, this will almost certainly
+   speed up your project’s development.
+
+3. CI systems make your work visible to everyone on your team. No one has to
+   search very hard to find the code, data, and model from your best run.
+
+And I promise, once you get into the groove, it is incredibly fun to have your
+model training, recording, and reporting automatically kicked off by a single
+git commit.
+
+You will feel so cool.
+ +![Pixel Illustration GIF by Walter Newton](https://media.giphy.com/media/26AHG5KGFxSkUWw1i/giphy.gif) + +### Further reading + +- [Continuous Integration](https://www.martinfowler.com/articles/continuousIntegration.html), + the seminal Martin Fowler blog on the subject + +- [Continuous Delivery for Machine Learning](https://martinfowler.com/articles/cd4ml.html), + another excellent blog on Martin Fowler’s site about building a continuous + integration & continuous delivery system for ML + +- [The DevOps Handbook](https://www.amazon.com/DevOps-Handbook-Second-World-Class-Organizations/dp/B09L56CT6N), + a beloved guide that is recommended for nearly any organization (ML, software, + or not) + +_**Note:** This article has been cross-posted on Medium._ diff --git a/content/blogs/2020-07-22-july-20-community-gems.md b/content/blogs/2020-07-22-july-20-community-gems.md new file mode 100644 index 0000000000..37a809e055 --- /dev/null +++ b/content/blogs/2020-07-22-july-20-community-gems.md @@ -0,0 +1,192 @@ +--- +title: July '20 Community Gems +date: 2020-07-31 +description: > + A roundup of technical Q&A's from the DVC community. This month, we discuss + getting started with CML, configuring your DVC cache, and how to request a + tutorial video. +descriptionLong: > + A roundup of technical Q&A's from the DVC community. This month, we discuss + getting started with CML, configuring your DVC cache, and how to request a + tutorial video. +picture: 2020-07-31/Gems_July_20.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/july-20-community-gems/460 +tags: + - Community Gems + - CML + - GCP + - DVC 1.0 +--- + +Here are some of our top Q&A's from around the community. With the launch of +[CML](https://cml.dev) earlier in the month, we've got some new ground to cover! + +## DVC questions + +### [Q: Recently, I set up a global DVC remote. 
Where can I find the config file?](https://discordapp.com/channels/485586884165107732/563406153334128681/717673618217238598)
+
+When you
+[create a global DVC remote](https://dvc.org/doc/command-reference/remote/list#options),
+a config file will be created in `~/.config/dvc/config` instead of your project
+directory (i.e., `.dvc/config`).
+
+Note that on a Windows system, the config file will be created at
+`C:\Users\<username>\AppData\Local\iterative\dvc\config`.
+
+### [Q: I'm working on a collaborative project, and I use `dvc pull` to sync my local workspace with the project repository. Then, I try running `dvc repro`, but get an error: `dvc.yaml does not exist`. No one else on my team is having this issue. Any ideas?](https://discordapp.com/channels/485586884165107732/485596304961962003/731188065078345799)
+
+This error suggests there is no `dvc.yaml` file in your project. Most likely,
+this means your teammates are using DVC version 0.94 or earlier, before the
+`dvc.yaml` standard was introduced. Meanwhile, it sounds like you're using
+version 1.0 or later. You can check by running
+
+```dvc
+$ dvc version
+```
+
+The best solution is for your whole team to upgrade to the latest version- and
+there's an easy
+[migration script to help you make the move](https://towardsdatascience.com/automatically-migrate-your-project-from-dvc-0-94-to-dvc-1-x-416a5b9e837b).
+If for some reason this won't work for your team, you can either downgrade to a
+previous version, or use a workaround:
+
+```dvc
+$ dvc repro <.dvc file>
+```
+
+substituting the appropriate `.dvc` file for your pipeline. DVC 1.0 is backwards
+compatible, so pipelines created with previous versions will still run.
+
+### [Q: Does the DVC installer for Windows also include the dependencies for using cloud storage, like S3 and GCP?](https://discordapp.com/channels/485586884165107732/485596304961962003/715717911574216735)
+
+If you're installing DVC from binary-such as the `dvc.exe`
+[downloadable on the DVC homepage](https://dvc.org/)- all the standard
+dependencies are included. You shouldn't need to use `pip` to install extra
+packages (like `boto` for S3 storage).
+
+### [Q: Is there a way to setup my DVC remote so I can manually download files from it without going through DVC?](https://discordapp.com/channels/485586884165107732/563406153334128681/717458695709130764)
+
+When DVC adds a file to a remote repository (such as an S3 bucket, or an SSH
+file server), there's only one change happening: DVC calculates an md5 for the
+file and renames it with that md5. In technical terms, it's storing files in a
+"content-addressable way". That means if you know the hash of a file, you can
+locate it in your DVC remote and manually download it.
+
+To find the hash for a given file, say `data.csv`, you can look in the
+corresponding DVC file:
+
+```dvc
+$ cat data.csv.dvc
+```
+
+Another approach is using a built-in DVC function:
+
+```dvc
+$ dvc get --show-url . data.csv
+```
+
+You can read more about `dvc get --show-url` in
+[our docs](https://dvc.org/doc/command-reference/get#options). Note that this
+functionality is also part of our Python API, so you can locate the path to a
+file in your remote within a Python environment.
+[Check out our API docs!](https://dvc.org/doc/api-reference/get_url)
+
+### [Q: By default, each DVC project has its own cache in the project repository. To save space, I'm thinking about locally creating a single cache folder and letting multiple project repositories point there.
Will this work?](https://discordapp.com/channels/485586884165107732/563406153334128681/736164141701791815) + +Yes, we hear from many users who have created a +[shared cache](https://dvc.org/doc/user-guide/how-to/share-a-dvc-cache#configure-the-shared-cache). +Because of the way DVC uses content-addressable filenames, you won't encounter +issues like accidentally overwriting files from one project with another. + +A possible issue is that a shared cache will grant all teammates working on a +given project access to the data from all other projects using that cache. If +you have sensitive data, you can create different caches for projects involving +private and public data. + +To learn more about setting your cache directory location, +[see our docs](https://dvc.org/doc/command-reference/cache/dir). + +## CML questions + +### Q: I use Bitbucket. Will CML work for me? + +The first release of CML is compatible with GitHub and GitLab. We've seen +[many requests for Bitbucket support](https://github.com/iterative/cml/issues/140), +and we're actively investigating how to add this. Stay tuned. + +### [Q: I have on-premise GPUs. Can CML use them to execute pipelines?](https://discordapp.com/channels/485586884165107732/728693131557732403/730070747388706867) + +Yep! You can use on-premise compute resources by configuring them as self-hosted +runners. See +[GitHub](https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners) +and [GitLab](https://docs.gitlab.com/runner/)'s official docs for more details +and setup instructions. + +### [Q: I'm building a workflow that deploys a GCP Compute Engine instance, but I can only find examples with AWS EC2 in the CML docs. What do I do?](https://discordapp.com/channels/485586884165107732/728693131557732403/730688592787275806) + +There is a slight difference in the way CML handles credentials for AWS and GCP, +and that means you'll have to modify your workflow file slightly. 
We've added an +example workflow for GCP to our +[project README](https://github.com/iterative/cml#allocating-cloud-resources-with-cml). + +We've updated our +[cloud compute use case repository docs](https://github.com/iterative/cml_cloud_case#using-a-different-cloud-service) +to cover a GCP example. + +Note that for Azure, the workflow will be the same as for AWS. You'll only have +to change the arguments to `docker-machine`. + +### [Q: I don't see any installation instructions in the CML docs. Am I missing something?](https://discordapp.com/channels/485586884165107732/728693131557732403/733659483758133269) + +Nope, there's no installation unless you wish to install CML in your own Docker +image. As long as you are using GitHub Actions or GitLab CI with the CML Docker +images, no other steps are needed. + +If you're creating your own Docker image to be used in a GitHub Action or GitLab +CI pipeline, you can add CML to your image via npm: + +```bash +$ npm i -g @dvcorg/cml +``` + +### [Q: Can I use CML with MLFlow?](https://www.youtube.com/watch?v=9BgIDqAzfuA&lc=Ugw-VxQqAaqi9hmqB3t4AaABAg) + +CML is designed to integrate with lots of tools that ML teams are already +familiar with. For example, we set up a wrapper to use CML with Tensorboard, so +you get a link to your Tensorboard in a PR whenever your model is training +([check out the use case](https://github.com/iterative/cml_tensorboard_case/pull/3)). + +While we haven't yet tried to create a use case with MLFlow in particular, we +think a similar approach could work. We could imagine using MLFlow for +hyperparameter searching, for example, and then checking in your best model with +Git to a CI system for evaluation in a production-like environment. CML could +help you orchestrate compute resources for model evaluation in your custom +environment, pulling the model and any validation data from cloud storage, and +reporting the results in a PR. 
+ +If this is something you're interested in, make an issue on our project +repository to tell us more about your project and needs- that lets us know it's +a priority in the community. + +### Q: Are there more tutorial videos coming? + +Yes! We recently launched +[our first CML tutorial video](https://dvc.org/blog/first-mlops-tutorial), and a +lot of folks let us know they want more. We're aiming to release a new video +every week or so in the coming months. Topics will include: + +- Using DVC to push and pull data from cloud storage to your CI system +- Using CML with your on-premise hardware +- Building a data dashboard in GitHub & GitLab for monitoring changes in dynamic + datasets +- Provisioning cloud compute from your CI system +- Creating a custom Docker container for testing models in a production-like + environment + +We really want to know what use cases, questions, and issues are most important +to you. This will help us make videos that are most relevant to the community! +If you have a suggestion or idea, no matter how small, we want to know. Leave a +[comment on our videos](https://youtu.be/9BgIDqAzfuA), +[reach out on Twitter](https://twitter.com/dvcorg), or +[ping us in Discord](https://discord.gg/bzA6uY7). diff --git a/content/blogs/2020-07-24-first-mlops-tutorial.md b/content/blogs/2020-07-24-first-mlops-tutorial.md new file mode 100644 index 0000000000..abb8ed97c8 --- /dev/null +++ b/content/blogs/2020-07-24-first-mlops-tutorial.md @@ -0,0 +1,67 @@ +--- +title: | + NEW VIDEO! 🎥 MLOps Tutorial #1: + Intro to continuous integration for ML +date: 2020-07-24 +description: > + A video tutorial about using continuous integration in data science and + machine learning projects. This tutorial shows how to use GitHub Actions and + Continuous Machine Learning (CML) to create your own automated model training + and evaluation system. 
+picture: 2020-07-24/blog_header.png
+author: elle_obrien
+commentsUrl: https://discuss.dvc.org/t/new-video-mlops-tutorial-1-intro-to-continuous-integration-for-ml/454
+tags:
+  - CI/CD
+  - DevOps
+  - MLOps
+  - CML
+  - Tutorial
+---
+
+Earlier this month, we launched [CML](https://cml.dev), our latest open-source
+project in the MLOps space. We think it's a step towards establishing powerful
+DevOps practices (like continuous integration) as a regular fixture of machine
+learning and data science projects. But there are plenty of challenges ahead,
+and a big one is _literacy_.
+
+So many data scientists, like developers, are self-taught. Data science degrees
+have only recently emerged on the scene, which means if you polled a handful of
+senior-level data scientists, there'd almost certainly be no universal training
+or certificate among them. Moreover, there's still no widespread agreement about
+what it takes to be a data scientist: is it an engineering role with a little
+bit of Tensorflow sprinkled on top? A title for statisticians who can code?
+We're not expecting an easy resolution to these existential questions anytime
+soon.
+
+In the meantime, we're starting a video series to help data scientists curious
+about DevOps (and developers and engineers curious about data science!) get
+started. Through hands-on coding examples and use cases, we want to give data
+science practitioners the fundamentals to explore, use, and influence MLOps.
+
+The first video in this series uses a lightweight and fairly popular data
+science problem- building a model to predict wine quality ratings- as a
+playground to introduce continuous integration.
+
+The tutorial covers:
+
+- Using Git-flow in a data science project (making a feature branch and pull
+  request)
+- Creating your first GitHub Action to train and evaluate a model
+- Using CML to generate visual reports in your pull request summarizing model
+  performance
+
+It's now up on YouTube!
+ +https://youtu.be/9BgIDqAzfuA + +[Code for the project is available online](https://github.com/andronovhopf/wine) +so you can follow along! We also recommend checking out the +[CML docs](https://github.com/iterative/cml) for more details, tutorials, and +use cases. + +If you have questions, the best way to get in touch is by leaving a comment on +the blog, video, or our [Discord channel](https://discord.gg/bzA6uY7). And, +we're especially interested to hear what use cases you'd like to see covered in +future videos- tell us about your data science project and how you could imagine +using continuous integration, and we might be able to create a video! diff --git a/content/blogs/2020-07-27-shtab-completion-release.md b/content/blogs/2020-07-27-shtab-completion-release.md new file mode 100644 index 0000000000..33246b8ed1 --- /dev/null +++ b/content/blogs/2020-07-27-shtab-completion-release.md @@ -0,0 +1,212 @@ +--- +title: '(Tab) Complete Any Python Application in 1 Minute or Less' +date: 2020-07-27 +description: > + We've made a painless tab-completion script generator for Python applications! + Find out how to take advantage of it in this blog post. +descriptionLong: > + We've made a painless tab-completion script generator for Python applications! + It's called `shtab` and it currently works with `argparse`, `docopt`, and + `argopt` to produce `bash` and `zsh` completion scripts. This tool was + originally created to help `dvc`, but we realised it could be made more + generic and valuable to the world's entire ecosystem of Python CLI + applications. Find out how to take advantage of it in this blog post. +picture: 2020-07-27/tab-py.jpg +pictureComment: Zero Effort Tab Completion for Python Applications +author: casper_dcl +commentsUrl: https://discuss.dvc.org/t/tab-complete-any-python-application-in-1-minute-or-less/455 +tags: + - shtab + - Release + - CLI + - Autocomplete + - Tab + - Completion + - Python +--- + +Command line tools are powerful. 
Things like [`make`] have manual pages +spanning, well, +[pages](https://www.gnu.org/software/make/manual/make.html#Options-Summary), +while just the list of [`git`] subcommands is longer than can fit on a standard +`80 x 24` terminal screen. + +```dvc +$ git +add filter-branch rebase +am format-patch reflog +annotate fsck relink +... +describe prco unassume +--More-- +``` + +Notice the `--More--` at the bottom? That's the joy of pagination. + +Notice the `` at the top? That represents actually pressing the tab key. +Ah, the joy of shell tab completion. + +Tab completion is an indispensable part of writing anything on the command-line. +Personally, I can't imagine trying to `git co` (aliased to `git checkout`) a +branch without `` to do the heavy lifting. +[They say](https://en.wikipedia.org/wiki/Letter_frequency) "E" is the most +common vowel, and "T" the most common consonant. My keyboard use probably looks +more like this: + +![](../uploads/images/2020-07-27/key-frequencies.png 'Yes, I use vim =500')_My +key usage_ + +Now, there's a tool called `dvc` which is like [Git for data](https://dvc.org). +It can be viewed as a cross-platform combination of [`git`] and [`make`] +designed for handling big data and multiple cloud storage repositories, as well +as tracking machine learning experiments. As you can imagine, supporting that +many buzzwords means it also has a large number of subcommands and options. + +_Every time a new feature is added, maintainers and contributors have to update +tab completion scripts for multiple supported shells. At best, it's a pain, and +at worst, error-prone. If you've worked on maintaining CLI applications, you'll +sympathise._ + +Surely the parser code you've written is informative enough to automate tab +completion? Surely you shouldn't have to maintain and synchronise separate tab +completion scripts? + +Good news: [`shtab`] is a new tool which magically does all of this work. 
+ +Any Python CLI application using [`argparse`], [`docopt`], or [`argopt`] can +have tab completion for free! + +Simply hand your parser object to `shtab` (either via the CLI or the Python +API), and a tab completion script will be generated for your preferred shell. +It's as easy as: + +- CLI: `shtab --shell=bash myprogram.main.parser`, or +- Python API: `import shtab; print(shtab.complete(parser, shell="bash"))`. + +### `argparse` example + +Suppose you have some code in a module `hello.main`: + +```python +import argparse + +def get_main_parser(): + parser = argparse.ArgumentParser(prog="hello") + parser.add_argument( + "who", help="good question", nargs="?", default="world") + parser.add_argument( + "--what", help="a better question", default="hello", + choices=["hello", "goodbye"]) + return parser + +if __name__ == "__main__": + parser = get_main_parser() + args = parser.parse_args() + print("{}, {}!".format(args.what, args.who)) +``` + +To get tab completion for `bash`, simply install [`shtab`] and then run: + +```bash +shtab --shell=bash hello.main.get_main_parser \ + | sudo tee "$BASH_COMPLETION_COMPAT_DIR"/hello >/dev/null +``` + +Zsh user? Not a problem. Simply run: + +```bash +shtab --shell=zsh hello.main.get_main_parser \ + | sudo tee /usr/local/share/zsh/site-functions/_hello >/dev/null +# note the underscore `_` prefix in the filename +``` + +Handily you can install `shtab`'s own completions by following the above +examples replacing `hello` with `shtab`. + +![](../uploads/images/2020-07-27/dvc.gif)_`shtab`-driven `dvc` completion in +`bash` and `zsh`_ + +Using `shtab`, here's what +[`dvc`'s completion](https://dvc.org/doc/install/completion) looks like when +installed: + +```dvc +% dvc +Completing dvc commands +add -- Track data files or directories with DVC. +cache -- Manage cache settings. +checkout -- Checkout data files from cache. +commit -- Save changed data to cache and update DVC-files. +completion -- Prints out shell tab completion scripts. 
+At Top: Hit TAB for more, or the character to insert +``` + +All completion suggestions guaranteed in-sync with the code! The maintainers of +`dvc` were very surprised to find no less than +[84 commits](https://github.com/iterative/dvc/commits/main/scripts/completion) +touching their old completion scripts. Such churn is now a thing of the past! + +You might notice one of the subcommands provided by `dvc` is +[`completion`](https://dvc.org/doc/install/completion). Here's a quick example +of how to provide such convenience for users: + +### Integrating library example + +Feeling minimal? How about adding `import shtab` to your application itself for +a cleaner user interface? And let's use [`argopt`] to convert [`docopt`]'s neat +syntax to `argparse` while we're at it. + +```python +"""Greetings and partings. + +Usage: + greeter [options] [] [] + +Options: + -g, --goodbye : Say "goodbye" (instead of "hello") + -b, --print-bash-completion : Output a bash tab-completion script + -z, --print-zsh-completion : Output a zsh tab-completion script + +Arguments: + : Your name [default: Anon] + : My name [default: Casper] +""" +import sys, argopt, shtab + +parser = argopt.argopt(__doc__) +if __name__ == "__main__": + args = parser.parse_args() + if args.print_bash_completion: + print(shtab.complete(parser, shell="bash")) + sys.exit(0) + if args.print_zsh_completion: + print(shtab.complete(parser, shell="zsh")) + sys.exit(0) + + msg = "k thx bai!" if args.goodbye else "hai!" + print("{} says '{}' to {}".format(args.me, msg, args.you)) +``` + +### Try it out + +There are many more options and features. The [documentation][`shtab`] includes +examples of working with custom file completions and providing a `completion` +subcommand when integrating more tightly with existing applications. + +Try it out with `pip install -U shtab` or `conda install -c conda-forge shtab`! + +Is it worth the time? 
+ +![](https://imgs.xkcd.com/comics/is_it_worth_the_time.png)_It's worth it +[xkcd#1205](https://xkcd.com/1205)_ + +[`shtab`] would be on the second row, far left (maybe even off grid). It's worth +spending days to get right yet only takes seconds to install. + +[`argopt`]: https://pypi.org/project/argopt +[`argparse`]: https://docs.python.org/library/argparse +[`docopt`]: https://pypi.org/project/docopt +[`dvc`]: https://github.com/iterative/dvc +[`git`]: https://git-scm.com +[`make`]: https://en.wikipedia.org/wiki/Make_(software) +[`shtab`]: https://github.com/iterative/shtab diff --git a/content/blogs/2020-08-07-cml-self-hosted-runners-on-demand-with-gpus.md b/content/blogs/2020-08-07-cml-self-hosted-runners-on-demand-with-gpus.md new file mode 100644 index 0000000000..c3c2636e29 --- /dev/null +++ b/content/blogs/2020-08-07-cml-self-hosted-runners-on-demand-with-gpus.md @@ -0,0 +1,178 @@ +--- +title: CML self-hosted runners on demand with GPUs +date: 2020-08-07 +description: > + Use your own GPUs with GitHub Actions & GitLab for continuous machine + learning. +descriptionLong: > + Training models often requires special hardware, like extra memory or GPUs. + How can we make a CI/CD pipeline with this hardware? Find out how to set up + your own self-hosted runners on-demand with GPUs for fast training. +picture: 2020-08-07/header.png +author: david_g_ortega +commentsUrl: https://discuss.dvc.org/t/cml-self-hosted-runners-on-demand-with-gpus/462 +tags: + - CML + - CI/CD + - MLOps + - GPUs + - Self-hosted runners + - Reproducibility + - Tutorial +--- + +When creating your CI/CD workflow for a machine learning (ML) project, you might +find that by default, neither GitHub Actions nor GitLab CI provides the +computing capabilities you need- like GPUs, high memory instances, or multiple +cores. + +To overcome this hardware hurdle, one practical approach is to use self-hosted +runners: runners that you manage, but are accessible to your CI/CD system for +executing jobs. 
It could be an EC2 instance or the GPU under your desk. In our +[recently-released project](https://dvc.org/blog/cml-release), Continuous +Machine Learning (CML), our Docker image acts as a thin wrapper over GitLab and +GitHub runners, adding some extra capabilities. + +Here are some benefits of using CML with a self-hosted runner: + +1. **Easy to use.** Working the same way for both GitLab and GitHub. + +2. **Get out of dependency hell.** We tend to install packages (on top of + packages, on top of packages…) while we‘re experimenting with models. In ML + in particular, we can be dependent on drivers AND libraries, and sometimes + precise versions of them (CUDA and TensorFlow, anyone?). Your CI workflow + will install all the dependencies in the containerised runner leaving your + machine always clean. + +3. **Security.** If your repo is public your runners could be accessed by + anyone that could add + [scripts that exploits your machine](https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners#self-hosted-runner-security-with-public-repositories). + With the containerised runner you are restricting the access to your real + machine. + +4. **Gain reproducibility.** One of the biggest technical debts in the ML space + is reproducibility. A few weeks post-experiment, we often discover that + trying to put your model back in shape is a pain. Looking at our repo, it’s + not obvious what data or training infrastructure or dependencies went into a + given result. When you move your ML experiments into a CI/CD system you are + making a contract of the dependencies and hardware used for your experiment. + Having that contract isolated by the containerised runner, your experiment + is perfectly reproducible by anyone in the future. 
+ +## Hands on GPU Self-hosted runners 101 + +### 1) Install nvidia drivers and nvidia-docker in your machine (ubuntu 18.04) + +```dvc +$ curl -s -L https://nvidia.GitHub.io/nvidia-docker/gpgkey | sudo apt-key add - && \ + curl -s -L https://nvidia.GitHub.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list && \ + sudo apt update && sudo apt install -y ubuntu-drivers-common && \ + sudo ubuntu-drivers autoinstall && \ + sudo apt install -y nvidia-container-toolkit && \ + sudo systemctl restart docker +``` + +You can test that your gpus are up and running with the following command: + +```dvc +$ docker run --gpus all iterativeai/cml:0-dvc2-base1-gpu nvidia-smi +``` + +We should see something like this: +![](../uploads/images/2020-08-07/nvidia-smi-output.png) + +### 2) Start your self-hosted runner + +With CML docker images launching your own self-hosted runner is very easy. These +images have CML and DVC preinstalled (among other perks), plus CUDA drivers. +That's all. You can clone these images and add your own dependencies to better +mimic your own production environment. + +```dvc +$ docker run --name myrunner -d --gpus all \ + -e RUNNER_IDLE_TIMEOUT=1800 \ + -e RUNNER_LABELS=cml,gpu \ + -e RUNNER_REPO=$my_repo_url \ + -e repo_token=$my_repo_token \ + iterativeai/cml:0-dvc2-base1-gpu runner +``` + +where: + +`RUNNER_IDLE_TIMEOUT` is the time in seconds that the runner is going to be idle +at most waiting for jobs to come, if no one comes the runner shuts down and +unregisters from your repo. + +`RUNNER_LABELS` a comma delimited list of labels that we are setting in our +workflow that the jobs will wait for. + +`RUNNER_REPO` is the url of your GitLab or GitHub repo. repo_token is the +personal token generated for your GitHub or GitLab repo. Note that for GitHub +you must check `workflow` along with `repo`. + +If everything went fine we should see a runner registered in our repo. 
+ +![](../uploads/images/2020-08-07/registered-cml-runner-github.png) + +![](../uploads/images/2020-08-07/registered-cml-runner-gitlab.png) + +### 3) Setup your GitHub Actions or GitLab workflow yaml file to use the runner and commit your changes. + +GitLab + +```yaml +train: + tags: + - cml + - gpu + script: + - echo 'Hi from CML!' >> report.md + - cml send-comment report.md +``` + +GitHub + +```yaml +name: train-my-model +on: [push] +jobs: + train: + runs-on: [self-hosted, cml, gpu] + steps: + - uses: actions/checkout@v2 + - name: cml_run + run: | + echo 'Hi from CML!' >> report.md + cml send-comment report.md +``` + +Congrats! At this point you have done all the steps to have your GPUs up and +running with CML. + +# Limitations and future directions + +There are still some limitations to be solved at this stage: + +- GitHub Actions + [can’t run a workflow longer than 72 hours](https://docs.github.com/en/actions/getting-started-with-github-actions/about-github-actions#usage-limits). + +- Self-hosted runners + [don’t behave well when they disconnect from the repo](https://GitLab.com/GitLab-org/GitLab/-/issues/229851#note_390371734), + limiting the possibilities with preemptible instances (also known as spot + instances). + +We’re working on these issues (see issues +[#161](https://github.com/iterative/cml/issues/161), +[#174](https://github.com/iterative/cml/issues/174), and +[#208](https://github.com/iterative/cml/issues/208)) both in terms of CML and +DVC capabilities. So keep watching this space for updates! + +
+ +We started CML to help teams deal with the complexity of ML more effectively- +continuous integration is a proven approach to keeping projects agile even as +the team size, number of experiments, and number of dependencies increase. +Treating experiments like potential new features in a software project opens up +many possibilities for improving our engineering practices. We’re looking +forward to an era when ML experiments can be created, logged, and merged into +production-ready code in minutes, not days or weeks. diff --git a/content/blogs/2020-08-10-august-20-dvc-heartbeat.md b/content/blogs/2020-08-10-august-20-dvc-heartbeat.md new file mode 100644 index 0000000000..66af8f1b93 --- /dev/null +++ b/content/blogs/2020-08-10-august-20-dvc-heartbeat.md @@ -0,0 +1,216 @@ +--- +title: August ’20 Heartbeat +date: 2020-08-10 +description: > + Catch our monthly updates- featuring the CML release, DVC meetup recap, a new + video tutorial series, and the best reading about pipelines and DataOps. +descriptionLong: > + Catch our monthly updates- featuring the CML release, DVC meetup recap, a new + video tutorial series, and the best reading about pipelines and DataOps. +picture: 2020-08-10/header.png +pictureComment: DeeVee avoids the summer sun at Mount Rainier National Park. +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/august-20-heartbeat/465 +tags: + - Heartbeat + - CML + - DVC + - Meetup +--- + +Welcome to our August roundup of cool news, new releases, and recommended +reading in the MLOps world! + +## News + +### CML release + +At the beginning of July, we went live with a new project: +[Continuous Machine Learning, or CML](https://cml.dev) for short. If you +hadn't heard, CML is an open-source toolkit for adapting popular continuous +integration systems like GitHub Actions and GitLab CI for machine learning and +data science. 
This release marks a new stage for our organization: while CML can +work with DVC, and both are built around Git, CML is designed for standalone +use. That means we're supporting TWO projects now! + +![Threaten Ashley Olsen GIF](https://media.giphy.com/media/X5i2BoQeD9kWY/giphy.gif) + +Luckily, we received plenty of encouraging and helpful feedback following the +CML release. CML was on the front page of Hacker News for most of release day! +We also got +[covered on Heise](https://www.heise.de/news/Machine-Learning-CML-schickt-Daten-und-Modelltraining-in-die-Pipeline-4841023.html), +a popular German IT news source. I (Elle, a proud part of the CML team!) also +gave a talk presenting our approach as part of the MLOps World meeting, which is +now available for online viewing. + +https://youtu.be/yp0su5mOeko + +Of course, we're fielding lots of questions too! We've compiled some of the most +common questions (and their answers!) in our last +[Community Gems post](https://dvc.org/blog/july-20-community-gems), and CML +developer [David G. Ortega](https://github.com/DavidGOrtega) has written a +tutorial for a much-asked-for use case: doing +[continuous integration with on-demand GPUs](https://dvc.org/blog/cml-self-hosted-runners-on-demand-with-gpus). + +If you have comments, questions, or feature requests about CML, we _really_ want +to hear from you. A few ways to be in touch: + +- Open an [issue on the project repo](https://github.com/iterative/cml/issues) +- Drop by the [CML Discord channel](https://discord.gg/bzA6uY7) +- Send us [an email](mailto:support@dvc.org) + +### July Meetup + +Last week, we had another meetup! +[DVC Ambassador Marcel](http://mribeirodantas.me/) kicked us off with a short +talk about how he's using DVC as part of his causal modeling approach to +bioinformatics. It's cool stuff. Then, I talked a bit about CML and did some +live-coding. 
The beauty of live-coding is getting to answer questions in +real-time, and if you're totally new to the idea of continuous integration (or +want to understand how CML works with GitHub Actions/GitLab CI) seeing a project +in-action is one of the best ways to learn. + +You can watch a recording of the meetup online now (it's lightly edited to +remove some pesky Zoom trolls), and +[join our Meetup group](https://www.meetup.com/DVC-Community-Virtual-Meetups) to +get updates for the next one. In future meetups, we'd love to support community +members sharing their work, so get in touch if you'd like to present. + +https://youtu.be/tnTPHG5seDs + +### New video series + +We're starting up some new YouTube features! If you haven't seen our channel, +[check it out and consider subscribing](https://www.youtube.com/channel/UC37rp97Go-xIX3aNFVHhXfQ) +for hands-on tutorials and demos. Our +[first video introduced continuous integration and GitHub Actions](https://youtu.be/9BgIDqAzfuA), +and the second showed +[how to use DVC and free Google Drive storage to add external data storage to a GitHub project](https://youtu.be/kZKAuShWF0s). + +In the coming weeks, we'll be covering: + +- Using CML and GitHub Actions with hardware for deep learning, like on-premise + GPUs +- Understanding Vega plots and making data viz part of your CI system +- Some DVC basics to supplement our docs + +## From the community + +### SpaCy + DVC = ❤️ + +We're huge fans of a recent Python Bytes episode featuring +[Ines Montani](https://twitter.com/_inesmontani), founder of Explosion and one +of the makers of the incredible SpaCy library for NLP (seriously, I have the +highest recommendations for SpaCy). + +> My [@PythonBytes](https://twitter.com/pythonbytes) episode is out now! 
+> +> 🎙️ Listen here: [https://t.co/fHLF2hR4cM](https://t.co/fHLF2hR4cM) +> +> My picks of the week are: +> 🐙 TextAttack by @jxmorris12: +> [https://t.co/jySYrtzzp8](https://t.co/jySYrtzzp8) +> 🦉 Data Version Control (DVC) [@DVCorg](https://twitter.com/DVCorg): +> [https://t.co/3610F6kv8v](https://t.co/3610F6kv8v) +> 🐍 Built-in generic types in 3.9 +> +> — Ines Montani 〰️ (@\_inesmontani) +> [July 23, 2020](https://twitter.com/_inesmontani/status/1286222512762871808) + +Ines' episode discussed DVC, and DVC is going to be integrated with SpaCy in +their 3.0 release. SpaCy + DVC is going to be a powerhouse and we can't wait. + +### Take a stab at shtab + +Another cool software project: [Casper da Costa-Luis](https://cdcl.ml), DVC +contributor and creator of the popular +[tqdm library](https://github.com/tqdm/tqdm), has published a tab-completion +script generator for Python applications! `shtab`, as it's called, was +originally designed for DVC, but Casper developed it into a generic tool that +can be used for virtually any Python CLI application. Check out +[`shtab` on GitHub](https://github.com/iterative/shtab) and read the release +blog. + + + +### DVC 1.0 migration script + +Our friends at [DAGsHub](https://dagshub.com/) have released a script to help +DVC users upgrade their pipelines to the new DVC 1.0 format! Says Simon, a +DAGsHub engineer, in his tutorial: + +> In this post, I'll walk you through the process of migrating your existing +> project from DVC ≤ 0.94 to DVC 1.X using a single automated script, and then +> demonstrate a way to check that your migration was successful. + +Read the blog and get migrating (but don't worry if you can't; DVC 1.0 is +backwards compatible). + +### Recommended reading + +Here are some of our favorite blogs from around the internet 🌏. 
+ +- [Déborah Mesquita](https://deborahmesquita.com/), data scientist (and an + excellent writer to follow), published a tutorial about DVC pipelines that is + truly deserving of the moniker "ultimate guide". It's a start-to-finish case + study about a typical machine learning project, with DVC pipelines to automate + everything from grabbing the data to training and evaluating a model. Also, it + comes with a video tutorial if you prefer to watch instead of read! + + + +- Software engineer + [Vaithy Narayanan](https://www.linkedin.com/in/vaithyanathan/) created the + first ever ☝️ CML user blog! Vaithy created a pipeline that covers data + collection to model training and testing, and used CML to automate the + pipeline execution whenever the project's GitHub repository is updated. He + ends with some insightful discussion about the strengths and weaknesses of the + approach. + + + +- [Ryan Gross](https://www.linkedin.com/in/ryan-w-gross/), a VP at Pariveda + Solutions, blogged about the future of data governance and the lessons from + DevOps that might save the day. Honestly, you should probably start reading + for this cover image alone. + + ![](../uploads/images/2020-08-10/dataops.png) _DataOps is accurately depicted + as a badass flaming eagle._ Check out the blog here: + + + +And, there's a +[noteworthy counterpoint](https://locallyoptimistic.com/post/git-for-data-not-a-silver-bullet/?utm_campaign=Data_Elixir&utm_source=Data_Elixir_298) +by +[Michael Kaminsky](https://www.linkedin.com/in/michael-the-data-guy-kaminsky/). +Read them both! + +Thanks everyone, that's it for this month. We hope you're staying safe and +making cool things! 
+ +![Reaction GIF by MOODMAN](https://media.giphy.com/media/35EsMpEfGHkVoHbNTU/giphy.gif) diff --git a/content/blogs/2020-08-27-august-20-community-gems.md b/content/blogs/2020-08-27-august-20-community-gems.md new file mode 100644 index 0000000000..1650a45fda --- /dev/null +++ b/content/blogs/2020-08-27-august-20-community-gems.md @@ -0,0 +1,194 @@ +--- +title: August '20 Community Gems +date: 2020-08-27 +description: > + A roundup of technical Q&A's from the DVC community. This month, we discuss + using CI/CD to validate models, advanced DVC pipeline scenarios, and how CML + adds pictures to your GitHub and GitLab comments. +descriptionLong: > + A roundup of technical Q&A's from the DVC community. This month, we discuss + using CI/CD to validate models, advanced DVC pipeline scenarios, and how CML + adds pictures to your GitHub and GitLab comments. +picture: 2020-08-27/Gems_Aug_20.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/august-20-community-gems/477 +tags: + - Community Gems + - CML + - Hyperparameters + - Git LFS + - Pipelines +--- + +Here are some of our top Q&A's from around the community. With the launch of +[CML](https://cml.dev) earlier in the month, we've got some new ground to cover! + +## DVC questions + +### [Q: What's the relationship between the DVC remote and cache? If I have an external cache, do I really need a DVC remote?](https://discordapp.com/channels/485586884165107732/563406153334128681/747588572479094866) + +You can think of your DVC remote similar to your Git remote, but for data and +model artifacts- it's a place to backup and share artifacts. It also gives you +methods to push and pull those artifacts to and from your team. + +Your DVC cache (by default, it's located in `.dvc/cache`) serves a similar +purpose to your Git objects database (which is by default located in +`.git/objects`). 
They're both _local_ caches that store files (including various +versions of them) in a content-addressable format, which helps you quickly +checkout different versions to your local workspace. The difference is that +`.dvc/cache` is for data/model artifacts, and `.git/objects` is for code. + +Usually, your DVC remote is a superset of `.dvc/cache`- everything in your cache +is a copy of something in your remote (though there may be files in your DVC +remote that are not in your cache (and vice versa) if you have never attempted +to `push` or `pull` them locally). + +In theory, if you are using an +[external cache](https://dvc.org/doc/use-cases/fast-data-caching-hub#example-shared-development-server)- +meaning a DVC cache configured on a separate volume (like NAS, large HDD, etc.) +outside your project path- and all your projects and all your teammates use that +external cache, and you _know_ that the storage is highly reliable, you don't +need to also have a DVC remote. If you have any doubts about access to your +external cache or its reliability, we'd recommend also keeping a remote. + +### [Q: One of my files is an output of a DVC pipeline, and I want to track this file with Git and store it in my Git repository since it isn't very big. How can I make this work?](https://discordapp.com/channels/485586884165107732/563406153334128681/732308317627613235) + +Yes! There are two approaches. We'll be assuming you have a pipeline stage that +outputs a file, `myfile`. + +- If you haven't declared the pipeline stage with `dvc run` yet, then you'll do + it like this: + +```dvc +$ dvc run -n <stage_name> -d <dependencies> -O myfile <command> +``` + +Note that instead of using the flag `-o` for specifying the output `myfile`, +we're using `-O`- it's shorthand for `--outs-no-cache`. You can +[read about this flag in our docs](https://dvc.org/doc/command-reference/run#options). 
+ +- If you've already created your pipeline stage, go into your `dvc.yaml` and + manually add the field `cache: false` to the stage as follows: + +```yaml +outs: + - myfile: + cache: false +``` + +Please note one special case: if you previously enabled hardlinks or symlinks in +DVC via `dvc config cache`, you may need to run `dvc unprotect myfile` to fully +unlink `myfile` from your DVC cache. If you haven't enabled these types of file +links (and if you're not sure, _you probably didn't!_), this step is unnecessary. +[See our docs for more.](https://dvc.org/doc/command-reference/unprotect) + +### [Q: Can I change my `params.yaml` file to a `.json`?](https://discordapp.com/channels/485586884165107732/563406153334128681/730614265051873370) + +Yes, this is straightforward- you change your `params.yaml` to `params.json` in +your workspace, and then use it in `dvc run`: + +```dvc +$ dvc run -p params.json:myparam ... +``` + +Alternately, if your pipeline stage has already been created, you can manually +edit your `dvc.yaml` file to replace `params.yaml` with `params.json`. + +For more about the `params.yaml` file, +[see our docs](https://dvc.org/doc/start/experiments#defining-parameters). + +### [Q: Is there a guide for migrating from Git-LFS to DVC?](https://discordapp.com/channels/485586884165107732/485596304961962003/743559246599421974) + +We don't know of any published guide. One of our users shared their procedure +for disabling LFS: + +```dvc +$ git lfs uninstall +$ git rm .gitattributes +$ git rm .lfsconfig +``` + +Then you can `dvc add` files you wish to put in DVC tracking, and `dvc push` +them to your remote. After that, `git commit` and you're good! + +Note that, if you're going to delete any LFS files, make sure you're certain the +corresponding data has been transferred to DVC. 
+ +### [Q: Is there a way to use DVC and CML to validate a model in a GitHub Action, without making the validation data available to the user opening the Pull Request?](https://discordapp.com/channels/485586884165107732/485596304961962003/739202123295883325) + +We don't have special support for this use case, and there may be some security +downsides to using a confidential validation dataset with someone else's code +(be sure nothing in their code could expose your data!). But, there are ways to +implement this if you're sure about it. + +One possible approach is to create a separate "data registry" repository using a +private cloud bucket to store your validation dataset +([see our docs about the why and how of data registries](https://dvc.org/doc/use-cases/data-registries#data-registries)). +Your CI system can be setup to have access to the data registry via secrets +(called "variables" in GitLab). Then when you run validation via +`dvc repro validate`, you could use `dvc get` to pull the private data from the +registry. + +The data is never exposed to the user in an interactive setting, only on the +runner- and there it's ephemeral, meaning it does not exist once the runner +shuts down. + +## CML questions + +### [Q: Sometimes when I make a commit on a branch, my CI workflow isn't triggered. What's going on?](https://www.youtube.com/watch?v=9BgIDqAzfuA&lc=UgwKIYsCo194AErdeBJ4AaABAg) + +If your workflow is set to trigger on a push (as in the CML use cases), it isn't +enough to `git commit` locally- you need to push to your GitHub or GitLab +repository. If you want every commit to trigger your workflow, you'll need to +push each one! + +What about if you _don't_ want a push to trigger your workflow? In GitLab, you +can use the +[`[ci skip]` flag](https://docs.gitlab.com/ee/ci/yaml/#skip-pipeline)- make sure +your commit message contains `[ci skip]` or `[skip ci]`, and GitLab CI won't run +the pipeline in your `gitlab-ci.yml` file. 
+ +In GitHub Actions, this flag isn't supported, so you can manually kill any +workflows in the Actions dashboard. For a programmatic fix, +[check out this workaround by Tim Heuer](https://timheuer.com/blog/skipping-ci-github-actions-workflows/). + +### [Q: Can I do the bulk of my model training outside of my CI system, and then share the result with CML?](https://twitter.com/peterkuai/status/1295899690404175872) + +Definitely! This is a desirable workflow in several cases: + +- You have a preferred approach for experiment tracking (for example, DVC or + MLFlow) that you want to keep using +- You don't want to set up a self-hosted runner to connect your computing + resources to GitHub or GitLab +- Training time is on the order of days or more + +CML is very flexible, and one strong use case is for sanity checking and +evaluating a model in a CI system post-training. When you have a model that +you're satisfied with, you can check it into your CI system and use CML to +evaluate the model in a production-like environment (such as a custom Docker +container), report its behavior and informative metrics. Then you can decide if +it's ready to be merged into your main branch. + +### [Q: Can I make a CML report comparing models across different branches of a project?](https://github.com/iterative/cml/issues/188) + +Definitely. This is what `dvc metrics diff` is for- like a `git diff`, but for +model metrics instead of code. We made a video about how to do this in CML! + +https://youtu.be/xPncjKH6SPk + +### [Q: In the function `cml publish`, it looks like you're uploading published files to `https://asset.cml.dev`. Why don't you just save images in the Git repository?](https://discordapp.com/channels/485586884165107732/728693131557732403/745168931521822740) + +If an image file is created as part of your workflow, it's ephemeral- it doesn't +exist outside of your CI runner, and will disappear when your runner is shut +down. 
To include an image in a GitHub or GitLab comment, a link to the image +needs to persist. You could commit the image to your repository, but typically, +[it's undesirable to automatically commit results of a CI workflow](https://stackoverflow.com/questions/61245284/is-it-necessary-to-commit-dvc-files-from-our-ci-pipelines). + +We created a publishing service to help you host files for CML reports. Under +the hood, our service uploads your file to an S3 bucket and uses a key-value +store to share the file with you. + +This covers a lot of cases, but if the files you wish to publish can't be shared +with our service for security or privacy reasons, you can emulate the +`cml publish` function with your own storage. You would push your file to +storage and include a link to its address in your markdown report. diff --git a/content/blogs/2020-09-09-september-20-dvc-heartbeat.md b/content/blogs/2020-09-09-september-20-dvc-heartbeat.md new file mode 100644 index 0000000000..44e8dce7c5 --- /dev/null +++ b/content/blogs/2020-09-09-september-20-dvc-heartbeat.md @@ -0,0 +1,188 @@ +--- +title: September ’20 Heartbeat +date: 2020-09-09 +description: > + This month, catch us on the Software Engineering Daily Podcast, check out our + favorite DVC and CML tutorials and projects, and celebrate 1000 YouTube + subscribers! +descriptionLong: > + This month, catch us on the Software Engineering Daily Podcast, check out our + favorite DVC and CML tutorials and projects, and celebrate 1000 YouTube + subscribers! +picture: 2020-09-09/header.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/september-20-heartbeat/488 +tags: + - Heartbeat + - CML + - DVC + - R + - Meetup + - Videos +--- + +## News + +### Dmitry on Software Engineering Daily + +Our CEO Dmitry Petrov was interviewed on the much-beloved Software Engineering +Daily podcast! 
Host [Jeff Meyerson](https://twitter.com/the_prion) kicked off +the discussion: + +> Code is version controlled through Git, the version control system originally +> built to manage the Linux codebase. For decades, software has been developed +> using git for version control. More recently, data engineering has become an +> unavoidable facet of software development. It is reasonable to ask–why are we +> not version controlling our data? + +For the rest of the episode, listen here! + + + +### Contributor's meetup + +Last week, we held a meetup for contributors to DVC! Core maintainer +[Ruslan Kupriev](https://github.com/efiop) hosted a get-together for folks who +contribute new features, bug fixes, and more to the community. If you missed it, +you can watch it on YouTube. + +https://youtu.be/jUYSTERXxWg + +### New videos + +We've released several new videos to our growing +[YouTube channel](https://www.youtube.com/channel/UC37rp97Go-xIX3aNFVHhXfQ)- and +cool news, we passed 1,000 subscribers! The support has been surprising in the +best way possible. We're seeing a lot of repeat commenters and folks from the +DVC meetups! It's been so rewarding to get positive feedback from the community +and we're planning to build our YouTube presence even more. + +![Happy GIF](https://media.giphy.com/media/ZE0JppdERv8t4jVCAt/giphy.gif) + +_Even Skeletor finds joy in this._ + +We now have 4 tutorials in our MLOps series. In the latest, we cover how to use +your own GPU (on-premise or in the cloud) to run GitHub Actions workflows. Check +it out and give it a try, the code examples are freely available :) + +https://youtu.be/rVq-SCNyxVc + +We also made our first ever "explainer" video to talk through how DVC works in +five minutes. + +https://youtu.be/UbL7VUpv1Bs + +As always, video requests are welcome! Reach out and let us know what topics and +tutorials you want to see covered. And we appreciate any likes, shares, and +subscribes on our growing YouTube channel. 
+ +## From the community + +### A three-part CML series (featuring R!) + +DVC ambassador [Marcel Ribeiro-Dantas](https://twitter.com/mribeirodantas) has +published two of three tutorial blogs in a series on CML! Marcel's use case is +especially cool because he's using R, plus some causal modeling related to his +work in bioinformatics, with GitHub Actions. + +In Part I, Marcel introduces his project and how he uses DVC, CML and GitHub +Actions together (with his custom R library). + + + +In Part II, Marcel takes a deeper dive into Docker. He explains how to create +your own Docker image and test it. This case should be helpful for folks who +want to include the CML library in their own Docker container. + + + +### Real Python talks DVC + +[Kristijan Ivancic](https://twitter.com/kristijan_ivanc) of +[Real Python](https://realpython.com), a library of online Python tutorials and lessons, +created a _seriously_ impressive DVC tutorial (this thing is a beast 🐺- it has +a table of contents!) + +![](../uploads/images/2020-09-09/Real_Python.png)_How cool is this artwork?_ + +And, the Real Python podcast discussed their DVC tutorial (plus the joys of +version control for data!) on a recent episode. + + + +### Recommended reading + +There's a lot of cool stuff happening out there in the data science world 🌏! + +- [Fabiana Clemente](https://twitter.com/fab_clemente), Chief Data Officer of + [YData](https://ydata.ai/), published a blog for The Startup about four + reasons to start using data version control- and, with her expertise in data + privacy, she's especially well-qualified to explain the role of DVC in + compliance and auditing! Check out her blog (it comes with a quick-start + tutorial, too). + + + +- Ryzal Kamis at the [AI Singapore Makerspace](https://makerspace.aisingapore.org) + shared a blog (the first of two!) about creating end-to-end CI/CD workflows + for machine learning. 
In his first blog, Ryzal gives a high-level overview of + the need for data version control and compares several tools in the space. + Then he gives a walkthrough (quite easy to follow!) of how DVC fits in his + workflow. We're eagerly awaiting the second installment of this series, which + promises to bring more advanced automation scenarios and a CI/CD pipeline. + + + +- [Isaac Sacolick](https://www.infoworld.com/author/Isaac-Sacolick/), + contributing editor at InfoWorld, penned an article about the growing field of + MLOps and its role in data-driven businesses. He writes: + +> Too many data and technology implementations start with poor or no problem +> statements and with inadequate time, tools, and subject matter expertise to +> ensure adequate data quality. Organizations must first start with asking smart +> questions about big data, investing in dataops, and then using agile +> methodologies in data science to iterate toward solutions. + +Read the rest here: + + + +Thanks everyone, that's a wrap for this month. Be safe, stay in touch, and get +ready for pumpkin spice latte season 🎃. + +![Cat Fall GIF](https://media.giphy.com/media/EDpVRPFK5bjfq/giphy.gif) diff --git a/content/blogs/2020-09-28-september-20-community-gems.md b/content/blogs/2020-09-28-september-20-community-gems.md new file mode 100644 index 0000000000..24cc5f0a77 --- /dev/null +++ b/content/blogs/2020-09-28-september-20-community-gems.md @@ -0,0 +1,208 @@ +--- +title: September '20 Community Gems +date: 2020-09-28 +description: > + A roundup of technical Q&A's from the DVC community. This month, we discuss + customizing your DVC plots, the difference between external dependencies and + outputs, and how to save models and data in CI. +descriptionLong: > + A roundup of technical Q&A's from the DVC community. This month, we discuss + customizing your DVC plots, the difference between external dependencies and + outputs, and how to save models and data in CI. 
+picture: 2020-09-28/Gems_Sept_20.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/september-20-community-gems/512 +tags: + - Community Gems + - CML + - Hyperparameters + - External Data + - SSH + - Vega +--- + +## DVC questions + +### [Q: When I try to push to my DVC remote, I get an error about my SSH-RSA keys. What's going on?](https://discordapp.com/channels/485586884165107732/485596304961962003/748735263634620518) + +If you're using DVC with an SSH-protected remote, DVC uses a Python library +called `paramiko` to create a connection to your remote. There is a +[known issue](https://stackoverflow.com/questions/51955990/base64-decoding-error-incorrect-padding-when-loading-putty-ppk-private-key-to) +that `paramiko` expects RSA keys in OpenSSH key format, and can throw an error +if the keys are in an alternative format (such as default PuTTY formatted keys). +If this is the case, you'll likely see: + +``` +ERROR: unexpected error - ('... ssh-rsa ...=', Error('Incorrect padding',)) +``` + +To fix this, convert your RSA key to the OpenSSH format. Tools like +[PuTTYgen](https://www.puttygen.com/) and +[MobaKeyGen](https://mobaxterm.mobatek.net/) can help you do this. + +### [Q: Can I have multiple `param.yaml` files in a project?](https://discordapp.com/channels/485586884165107732/563406153334128681/753322309942509578) + +Yes, you can have as many separate parameter files as you'd like. It's only +important that they are correctly specified in your DVC pipeline stages. + +For example, if you have files `params_data_processing.yaml` and +`params_model.yaml` in your project (perhaps to store hyperparameters of your +data processing and model fitting stages, respectively), you'll want to call the +right file at each stage. For example: + +```dvc +$ dvc run -n preprocess \ + -p params_data_processing.yaml:param1,param2,... +``` + +### [Q: Is there a way to automatically produce SVG plots from `dvc plot`? 
I don't like having to click through the Vega-Lite GUI to get an SVG, and my plots look so small when I access them in the browser.](https://discordapp.com/channels/485586884165107732/563406153334128681/750012082149392414) + +If your DVC plots (and by DVC plots, we mean Vega-Lite plots 😉) look small in +your browser, you can modify this programmatically! DVC generates Vega-Lite +plots by way of a few templates that come pre-loaded. The templates are in +`.dvc/plots` (assuming you're in a DVC directory). + +Find the template that corresponds to your plot (if you didn't specify a plot +type in your CLI command, it's probably `default.json`) and modify the `height` +and `width` parameters. Then save your changes. + +For more about how to modify your plot templates, check out the +[Vega docs](https://vega.github.io/vega/docs/specification/). If you're +considering making a whole new template that's custom for your data viz needs, +[we've got docs on that](https://dvc.org/doc/command-reference/plots#custom-templates), +too. + +One last tip: did you know about the +[Vega-Lite CLI](https://anaconda.org/conda-forge/vega-lite-cli)? It provides +functions for converting Vega-Lite plots to `.pdf`,`.png`,`.svg`, and `.vg` +(Vega) formats. To use this approach with DVC, you'll want to use the +`--show-vega` flag to print your plot specification to a `.json` file. + +```dvc +$ dvc plots --show-vega > vega.json +$ vl2svg vega.json +``` + +### [Q: I'm confused about external dependencies and outputs. What's the difference?](https://discordapp.com/channels/485586884165107732/485596304961962003/752478399326453840) + +In short, external outputs and dependencies are files or directories that are +tracked by DVC, but physically reside outside of the local workspace. 
This could +happen for a few reasons: + +- You want to version a dataset in cloud storage that is too large to transfer + to your local workspace efficiently +- Your DVC pipeline writes directly to cloud storage +- Your DVC pipeline depends on a dataset or other file in cloud storage + +An **external output** is declared in two ways: for example, if you have a file +`data.csv` in S3 storage, you can use +`dvc add --external s3://mybucket/data.csv` to begin DVC tracking the file +([there are plenty more details and tips about managing external data in our docs](https://dvc.org/doc/user-guide/managing-external-data)). +You can also declare `data.csv` as an output of a DVC pipeline with +`dvc run -o s3://mybucket/data.csv`. + +An **external dependency** is a dependency of a DVC pipeline that resides in +cloud storage. It's declared with the syntax +`dvc run -d s3://mybucket/data.csv`. + +One other difference to note: DVC doesn't cache external dependencies; it merely +checks if they have changed when you run `dvc repro`. On the other hand, DVC +_does_ cache external outputs. You'll want to set up an +[external cache](https://dvc.org/doc/user-guide/how-to/share-a-dvc-cache#configure-the-shared-cache) +in the same remote location where your files are stored. This is because the +default cache location (in your local workspace) no longer makes sense when the +dataset never "visits" your local workspace! An external cache works largely the +same as a typical cache in your workspace. + +## CML questions + +### [Q: How can I use CML with my own Docker container?](https://discordapp.com/channels/485586884165107732/728693131557732403/757553135840526376) + +In many of our CML docs and videos, we've shown how to get CML on your CI +(continuous integration) runner via a Docker container that comes with +everything installed. But this is not the only way to use CML, especially if you +want workflows to run in your own Docker container. 
+ +You can install CML via `npm`, either in your own Docker container or in your CI +workflow (i.e., in your GitHub Actions `.yaml` or GitLab CI `.yml` workflow +file). + +To install CML as a package, you'll want to run: + +```bash +$ npm i -g @dvcorg/cml +``` + +Note that you may need to install additional dependencies if you want to use DVC +plots and Vega-Lite commands: + +```bash +$ sudo apt-get install -y libcairo2-dev libpango1.0-dev libjpeg-dev libgif-dev \ + librsvg2-dev libfontconfig-dev +$ npm install -g vega-cli vega-lite +``` + +If you're installing CML as part of your workflow, you may need to install Node +first- +[check out our docs](https://github.com/iterative/cml#install-cml-as-a-package) +for how to do this in GitHub Actions and GitLab CI. + +### [Q: After running a GitHub Action workflow that runs a DVC pipeline, I want to save the output of the pipeline. Why doesn't CML automatically save the output?](https://discordapp.com/channels/485586884165107732/728693131557732403/757686601953312988) + +By design, artifacts generated in a CI workflow aren't saved anywhere- they +disappear as soon as the runner shuts down. So a DVC pipeline executed in your +CI system might produce outputs, like transformed datasets and model files, that +will be lost at the end of the run. If you want to save them, there are a few +methods. + +One approach is with auto-commits: a `git commit` at the end of your CI workflow +to commit any new artifacts to your Git repository. However, auto-commits have a +lot of downsides- they don't make sense for a lot of users, and generally, it's +better to re-create outputs as needed than save them forever in your Git repo. + +We created the DVC `run-cache` in part +[to solve this issue](https://stackoverflow.com/questions/61245284/is-it-necessary-to-commit-dvc-files-from-our-ci-pipelines). 
+Here's how it works: you'll set up a DVC remote with access credentials passed to
+your GitHub Action/GitLab CI via CML (see, for example,
+[this workflow](https://github.com/iterative/cml_dvc_case/blob/master/.github/workflows/cml.yaml)).
+Then you'll use the following protocol in your CI workflow (your workflow config
+file in GitHub/GitLab):
+
+```dvc
+$ dvc pull --run-cache
+$ dvc repro
+$ dvc push --run-cache
+```
+
+When you use this design, any artifacts of `dvc repro`, such as models or
+transformed datasets, will be saved in DVC storage and indexed by the pipeline
+version that generated them. You can access them in your local workspace by
+running
+
+```dvc
+$ dvc pull --run-cache
+$ dvc repro
+```
+
+While we think this is ideal for typical data science and machine learning
+workflows, there are other approaches too- if you want to go deeper exploring
+auto-commits, check out the
+[Add & Commit GitHub Action](https://github.com/marketplace/actions/add-commit).
+
+### [Q: What can CML do that Circle CI can't do?](https://www.youtube.com/watch?v=9BgIDqAzfuA&lc=Ugylt6QR5ClmD8uHe4B4AaABAg)
+
+To be clear, CML isn't a competitor to Circle CI. Circle CI is more analogous to
+GitHub Actions or GitLab CI; it's a continuous integration system.
+
+CML is a toolkit that works with a continuous integration system to 1) provide
+big data management (via DVC & cloud storage), 2) help you write model metrics
+and data viz to comments in GitHub/Lab, and 3) orchestrate cloud resources for
+model training and testing. Currently, CML is only available for GitHub Actions
+and GitLab CI.
+
+So to sum it up: CML is not a standalone continuous integration system! It's a
+toolkit that works with existing systems, which in the future could include
+Circle CI, Jenkins, Bamboo, Azure DevOps Pipelines, and Travis CI. 
Feel free to
+[open a feature request ticket](https://github.com/iterative/cml/issues), or
+leave a 👍 on open requests, to "vote" for the integrations you'd like to see
+most.
diff --git a/content/blogs/2020-10-12-october-20-dvc-heartbeat.md b/content/blogs/2020-10-12-october-20-dvc-heartbeat.md
new file mode 100644
index 0000000000..291b075a63
--- /dev/null
+++ b/content/blogs/2020-10-12-october-20-dvc-heartbeat.md
@@ -0,0 +1,164 @@
+---
+title: October ’20 Heartbeat
+date: 2020-10-12
+description: >
+ This month, hear about our international talks, new video docs on our YouTube
+ channel, and the best tutorials from our community.
+descriptionLong: >
+ This month, hear about our international talks, new video docs on our YouTube
+ channel, and the best tutorials from our community.
+picture: 2020-10-12/cover.png
+pictureComment:
+ Double DeeVee! One of these birds is on a layover before heading to Germany.
+author: elle_obrien
+commentsUrl: https://discuss.dvc.org/t/october-20-heartbeat/527
+tags:
+ - Heartbeat
+ - CML
+ - DVC
+ - Tutorial
+ - Conference
+ - Meetup
+ - YouTube
+---
+
+## News
+
+### Paweł gets ready to speak at Poland's largest data science meeting
+
+DVC developer Paweł Redzyński (he's written a lot of the code behind
+`dvc plots`) is giving a talk at the [Data Science Summit](https://dssconf.pl/)
+in Poland! The virtual meeting is on October 16, but talks are available for
+streaming on demand up to a week before. Paweł's talk is part of the DataOps &
+Development track, where he'll be sharing about CML and GitHub Actions (note
+that it'll be delivered in English).
+
+[![](../uploads/images/2020-10-12/dss.png)](https://dssconf.pl)
+
+### Dmitry talks at Data Engineering Melbourne
+
+CEO
+[Dmitry Petrov dropped into the Data Engineering Melbourne meetup](https://www.meetup.com/Data-Engineering-Melbourne/events/267033998/)
+to talk about Data Versioning and DataOps! 
He spoke about the differences +between end-to-end platforms and ecosystems of tools, and how this distinction +informs the development of software like DVC and CML (hint: we picked tools over +platforms). + +Keep an eye on this meetup, which is now accessible to folks on all continents +thanks to the magic of the internet :) + + + +### Elle has talks at PyCon India and PyData Global + +Last week I gave a talk about CML at +[PyCon India](https://in.pycon.org/cfp/2020/proposals/how-to-make-continuous-integration-work-with-machine-learning~avK5b/), +and have another one coming up at +[PyData Global](https://global.pydata.org/talks/321) this November 11-15. + + + +PyData Global has a fantastic lineup of talks spanning science and engineering, +so please consider joining! + +### DVC at DataFest + +DVC Ambassador Mikhail Rozhkov co-hosted the Machine Learning REPA +(Reproducibility, Experiments and Pipelines Automation) track of +[DataFest 2020](https://datafest.ru/), and DVC showed up in full force! There +were talks from Dmitry, ambassador Marcel Ribeiro-Dantas, and myself about all +aspects of MLOps and automation. + +DataFest is over (until next year, anyway), but +[visit the ML-REPA community](http://ml-repa.ru/en#about) for ongoing content +and opportunities for networking. + +### New videos + +Since the summer, we've been building our +[YouTube channel](https://www.youtube.com/channel/UC37rp97Go-xIX3aNFVHhXfQ). +It's going great- we've gotten more than 18,000 views in the last few months and +1,500 subscribers! + +Our latest video in the +[MLOps Tutorials](https://www.youtube.com/playlist?list=PL7WG7YrwYcnDBDuCkFbcyjnZQrdskFsBz) +series introduced using GitHub Actions for model testing- instead of training a +model in continuous integration, the idea is to train locally and "check-in" +your favorite model for testing in a standardized environment. 
This approach
+lets you completely control the environment, infrastructure, and code used to
+evaluate your model, and save the run in a place that's easy to share (GitHub!).
+
+https://youtu.be/bSXUJRnQPPo
+
+We'll be going deeper into the art and craft of testing ML models in the next
+few weeks, so stay tuned. Another big initiative is adding videos to our docs:
+since video seems like a popular format for a lot of learners, we're working to
+supplement our official docs with embedded videos. Check out our first
+installment on the
+[Getting Started with Data Versioning](https://dvc.org/doc/start/data-and-model-versioning).
+
+https://youtu.be/kLKBcPonMYw
+
+## From the community
+
+Our community makes some amazing tutorials. Here are a few on our radar:
+
+Data scientist and full-stack developer
+[Ashutosh Hathidara](https://github.com/ashutosh1919) shared an end-to-end
+machine learning project made with DVC and CML... and released it in video form!
+It's a neat setup and a nice model for folks to study.
+
+https://youtu.be/H1VBsK7XiKs
+
+Another detailed and easy-to-follow tutorial, with a similarly impressive scope,
+appeared on [Heise Online](https://www.heise.de/). This project puts together
+DVC, Cortex, and ONNX to develop and deploy a model trained on the Fashion MNIST
+dataset (note: the article is in German, and I read it with Chrome's English
+translation).
+
+
+
+You'll also want to check out [anno.ai](https://www.anno.ai/)'s tutorial about
+managing large datasets with DVC and S3 storage- it's detailed, but also a
+quick-start guide informed by the team's practical experience.
+
+
+
+Data scientist and mathematician [Khuyen Tran](https://twitter.com/KhuyenTran16)
+blogged about why and how to start using DVC- and her tutorial includes Google
+Drive remote storage, a feature we're especially excited about. Check it out and
+follow along with her code examples!
+
+
+
+And to end on a thoughtful note... 
have you seen this thread by ML Engineer +[Shreya Shankar](https://twitter.com/sh_reya)? She beautifully summarizes many +of the ideas and technical challenges our community thinks about every day. Read +and reflect! + +https://twitter.com/sh_reya/status/1314338372073263112 diff --git a/content/blogs/2020-10-26-october-20-community-gems.md b/content/blogs/2020-10-26-october-20-community-gems.md new file mode 100644 index 0000000000..67298fa0d7 --- /dev/null +++ b/content/blogs/2020-10-26-october-20-community-gems.md @@ -0,0 +1,170 @@ +--- +title: October '20 Community Gems +date: 2020-10-26 +description: > + A roundup of technical Q&A's from the DVC community. This month, learn how DVC + files work, how to use DVC plots for multi-class classification problems, and + how to deal with some spooky error messages 👻. +descriptionLong: > + A roundup of technical Q&A's from the DVC community. This month, learn how DVC + files work, how to use DVC plots for multi-class classification problems, and + how to deal with some spooky error messages 👻. +picture: 2020-10-26/Gems_Oct_20.png +pictureComment: | + Happy Halloween from Pirate DeeVee! +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/october-20-community-gems/535 +tags: + - Community Gems + - CML + - Vega + - Metrics +--- + +## DVC questions + +### [Q: What's in a `.dvc` file, and what would happen if decided not push my `.dvc` files to my Git repo?](https://discordapp.com/channels/485586884165107732/485596304961962003/760920403064520755) + +DVC creates lightweight metafiles (`.dvc` files) that correspond to large +artifacts in your project. These `.dvc` files contain pointers to your artifacts +in remote storage (we use a simple content-based storage scheme). 
Because we use
+content-based storage, the remote storage itself isn't designed for browsing
+(although
+[there are some discussions](https://github.com/iterative/dvc/issues/3621) about
+how to make stored files more "discoverable", and you can always identify them
+manually by their contents and meta-information like timestamps).
+
+Your `.dvc` files help establish meaningful links between human-readable
+filenames and file contents in remote storage, as well as to use Git versioning
+on your stored datasets and models. You can think of your DVC remote storage as
+a _complement_ to your Git repository, not a replacement.
+
+In other words... if you're not Git versioning your `.dvc` files, you're not
+versioning anything in DVC remote storage!
+
+### [Q: Can I limit the number of network connections used by DVC during `dvc pull`?](https://discordapp.com/channels/485586884165107732/485596304961962003/739760523293360182)
+
+Yep- by default, DVC data transfer operations use a number of threads
+proportional to the number of CPUs detected. But, there's a handy flag for
+`dvc pull` and `dvc push` that lets you override the defaults:
+
+```dvc
+-j , --jobs - number of threads to run
+simultaneously to handle the downloading of files from
+the remote. The default value is 4 * cpu_count(). For
+SSH remotes, the default is just 4. Using more jobs may
+improve the total download speed if a combination of small
+and large files are being fetched.
+```
+
+### [Q: I'm working on a multi-class classification task. Can `dvc plots` show multiple precision recall curves- one for each class?](https://discordapp.com/channels/485586884165107732/485596304961962003/765117500530491472)
+
+Currently, `dvc plots` doesn't support multiple linear curves on a single plot
+(except for `dvc plots diff`, of course!). But, you could make one precision
+recall curve per class and display them side-by-side. 
+ +To do this, you'd want to write the precision recall curve values to separate +files for each class (`prc-0.json`,`prc-1.json`, etc.). Then you would run: + +```dvc +$ dvc plots show prc-0.json prc-1.json +``` + +And you'll see two plots side-by-side! A benefit of this approach is that when +you run `dvc plots diff` to compare precision recall curves across Git commits, +you'll get a comparison plotted for each class. + +### [Q: Are you sure I should commit my `.dvc/config` file? It contains my logging credentials for storage, and I'm nervous about adding it to a shared Git repository.](https://discordapp.com/channels/485586884165107732/563406153334128681/768770079596740650) + +This is a common scenario- you don't necessarily want to broadcast your remote +storage credentials to everyone on your team, but you still want to check-in +your DVC setup (meaning, your `.dvc/config` file). In this case, you want to use +a `local` config file! + +You can use the command + +```dvc +$ dvc config --local +``` + +to setup remote credentials that will be stored in `.dvc/config.local`- by +default, this file is in your `.gitignore` so you don't have to worry about +accidentally committing secrets to your Git repository. +[Check out the docs](https://dvc.org/doc/command-reference/config) for more, +including the `--system` and `--global` options for setting your configuration +for multiple projects and users respectively. + +## CML Questions + +### [Q: What's the file size limit for publishing files with `cml publish`?](https://discordapp.com/channels/485586884165107732/728693131557732403/751001285100306502) + +`cml publish` is a service for hosting files that are embedded in CML reports, +like images, audio files, and GIFS. By default, we have a limit of 2 MB per +upload. + +If your files are larger than this (which can happen, depending on the machine +learning problem you're working on!) we recommend using GitLab's artifact +storage. 
+[Based on discussions in the community](https://github.com/iterative/cml/issues/232),
+we recently implemented a CML flag (`--gitlab-uploads`) to streamline the
+process:
+
+```dvc
+$ cml publish movie.mov --md --gitlab-uploads > report.md
+```
+
+Note that we don't currently have an analogous solution for GitHub, because
+GitHub artifacts expire after 90 days (whereas they're permanent in GitLab).
+
+### [Q: I'm getting a mysterious error message, `Failed guessing mime type of file`, when I try to use `cml publish`. What's going on?](https://discordapp.com/channels/485586884165107732/728693131557732403/763840404675756042)
+
+This error message usually means that the target of `cml publish`- for example,
+
+```dvc
+$ cml publish
+```
+
+is not found. Check for typos in the target filename and ensure that the file
+was in fact generated during the run (if it isn't part of your Git repository).
+We've [opened an issue](https://github.com/iterative/cml/issues/308) to add a
+more informative error message in the future.
+
+### [Q: In my GitHub Actions workflow, I use `dvc metrics diff` to compare metrics generated during the run to metrics on the main branch and print a table- but the table isn't showing any of the metrics from `main`. What could be happening?](https://discordapp.com/channels/485586884165107732/728693131557732403/768815157034876929)
+
+When a continuous integration runner won't report metrics from previous versions
+of your project (or other branches), that's usually a sign that the runner
+doesn't have access to the full Git history of your project or your metrics
+themselves. Here are a few things to check for:
+
+1. **Did you fetch your Git working tree in the runner?** Functions like
+ `dvc metrics diff` require the Git history to be accessible- make sure that
+ in your workflow, before you run this function, you've done a `git fetch`. We
+ recommend:
+
+```dvc
+$ git fetch --prune --unshallow
+```
+
+2. 
**Are your metrics in your DVC remote?** If your metrics are _cached_ (which + they are by default when you create a DVC pipeline), your DVC remote should + be accessible to your runner. That means you need to add any credentials as + repository secrets (or variables, in GitLab), and do `dvc pull` in your + workflow before attempting `dvc metrics diff`. + +3. **Are your metrics in your local workspace?** If you are _not_ using a DVC + remote, your metric files must be _uncached_ and committed to your Git + repository. To explore an example, say you have a pipeline stage that creates + `metric.json`: + +```dvc +$ dvc run -n mystage -m metric.json train.py +``` + +By default, `metric.json` is cached and ignored by Git- which means that if you +aren't using a DVC remote in your CI workflow, `metric.json` will effectively be +abandoned on your local machine! You can avoid this by using the `-M` flag +instead of `-m` in `dvc run`, or manually adding the field `cache: false` to +your metric in `dvc.yaml`. Be sure to remove your metrics from any `.gitignore` +files, and commit and push them to your Git repository. + +That's all for this month- Happy Halloween! Watch out for scary bugs. 🐛 diff --git a/content/blogs/2020-11-11-november-20-dvc-heartbeat.md b/content/blogs/2020-11-11-november-20-dvc-heartbeat.md new file mode 100644 index 0000000000..dd1c219ce7 --- /dev/null +++ b/content/blogs/2020-11-11-november-20-dvc-heartbeat.md @@ -0,0 +1,172 @@ +--- +title: November ’20 Heartbeat +date: 2020-11-11 +description: > + Catch our monthly updates- featuring new video docs and talks, new jobs at + DVC, and must-read contributions from the community about MLOps, data science + with R, and ML in production. +descriptionLong: > + Catch our monthly updates- featuring new video docs and talks, new jobs at + DVC, and must-read contributions from the community about MLOps, data science + with R, and ML in production. +picture: 2020-11-08/cover.png +pictureComment: Double DeeVee! 
+author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/november-20-heartbeat/554 +tags: + - Heartbeat + - CML + - DVC + - Tutorial + - Conference + - R + - MLOps +--- + +## News + +Welcome to the November Heartbeat! Let's dive in with some news from the team. + +### DataCouncil interviews Dmitry + +[Data Council](https://twitter.com/DataCouncilAI)'s +[Peter Soderling](https://twitter.com/petesoder?lang=en) interviewed CEO Dmitry! +Check out the recording from Data Council's live event, including Q&A from the +Data Council community, on YouTube. + +https://youtu.be/8dBCgIa7TGE + +### We're hiring + +Did you know we're hiring for two roles in our growing team? We're looking for: + +- A + [**Senior Software Engineer**](https://weworkremotely.com/remote-jobs/iterative-senior-software-engineer-open-source-dev-tools-3) + for the core DVC team- someone with strong Python development skills who can + build and ship essential DVC features. + +- A + [**Developer Advocate**](https://weworkremotely.com/remote-jobs/iterative-developer-advocate) + to lead the community, support contributors and new users, and create new + content like blogs and videos about DVC and CML. + +Here are a few reasons to consider joining us: + +- Your work will be visible and will be used by thousands developers every day! +- We're a small, fully remote team. Work from anywhere! +- Competitive salary and benefits +- Family-friendly benefits, including unlimited PTO + +If you're interested, we'd love to hear from you about either role (and we +welcome referrals if you know a good candidate)! + +### New videos + +We're continuing to develop our video docs, and now half of our "Getting +Started" section has video accompaniments. Check out our latest release on +[data access with DVC](https://dvc.org/doc/start/data-and-model-access): + +https://youtu.be/EE7Gk84OZY8 + +This video covers functions like `dvc get`, `dvc import`, and the DVC Python +API. 
+ +We took a quick break from releasing videos during the US election week, but +look out for a new video on our +[YouTube channel](https://www.youtube.com/channel/UC37rp97Go-xIX3aNFVHhXfQ) +about model testing with continuous integration! Subscribe to get alerts +whenever we have something new :) + +### Workshops and conferences + +As usual, there are plenty of remote meetings on our schedules: + +- [HealthData Bootcamp](http://www.bootcamp.dadosesaude.com/) is a weeklong + intensive for all things biomedical data science. Dmitry and myself (Elle), + plus DVC Ambassadors Mikhail Rozhkov and Marcel Ribeiro-Dantas, will be + presenting lectures and workshops about MLOps throughout the week! + +- I'll be leading a hands-on workshop at the + [Toronto Machine Learning Society Annual Meeting](https://torontomachinelearning.com/). + It'll cover how to get started using + [Continuous Machine Learning](https://cml.dev)(CML) with GitHub Actions- + [register here](https://torontomachinelearning.com/), and be sure to reserve + your spot in the workshop. + +- This week, I have another talk at [PyData Global](https://global.pydata.org/) + about CML. PyData Global is online for the first time ever and promises to be + a great gathering for Python-using data scientists in industry and academic + research alike. + +## From the community + +Here are some of our favorite happenings around the MLOps community this week. + +### A new online course + +[Goku Mohandas](https://twitter.com/GokuMohandas), founder of +[Made with ML](https://twitter.com/madewithml), announced plans to release a new +online course about putting ML in production. The curriculum will cover +everything from experiment tracking to deploying and monitoring models in +production, and you can expect DVC to be included! Keep an eye on Goku and Made +with ML on Twitter for updates. + +https://twitter.com/GokuMohandas/status/1315990996849627136 + +### Our favorite blogs + +[Dr. 
Larysa Visengeriyeva](https://twitter.com/visenger), creator of the +top-notch +["Awesome MLOps" GitHub repo](https://github.com/visenger/awesome-mlops), and +DevOps expert Anja Kammer wrote a must-read essay about CI/CD for ML (note: it's +published in German; I used Chrome's built-in translation to read in English). + +The blog covers key concepts like continuous integration, deployment, and +training with ML, as well as practical approaches and sample architectures. + + + +_Also_, there's some cool art. + +![](../uploads/images/2020-11-08/mlops_diagram.png) + +Another blog on our radar: [Sean Lopp](https://twitter.com/lopp_sean) at +[RStudio](https://twitter.com/rstudio) made the first known blog about a CML +report with a ggplot! Using RStudio's +[GitHub Actions for R](https://github.com/r-lib/actions) and CML, Sean built a +sample data science workflow that runs automatically in GitHub Actions on a +push. He reports on some pros, cons, and areas for future development to make R +language data science easy to automate. + + + +Finally, developer [Petr Stribny](https://twitter.com/stribny) wrote about how +to version big files in a Git project with DVC. It's a short-and-sweet guide to +getting started, and if you're trying to decide if DVC is for you, this is worth +a look. + + + +### A nice tweet + +To wrap it up, here's a kind tweet that we really like. It's always good to be +mentioned in the same tweet as some of our heroes :) + +https://twitter.com/ethanjb/status/1316833012676354048 + +Thanks for reading this month! diff --git a/content/blogs/2020-11-25-november-20-community-gems.md b/content/blogs/2020-11-25-november-20-community-gems.md new file mode 100644 index 0000000000..3ad37077d4 --- /dev/null +++ b/content/blogs/2020-11-25-november-20-community-gems.md @@ -0,0 +1,212 @@ +--- +title: November '20 Community Gems +date: 2020-11-25 +description: > + A roundup of technical Q&A's from the DVC community. 
This month, learn how to + clean your cache and use Git hooks with DVC. And here's an early holiday gift- + new Bitbucket support for CML! +descriptionLong: > + A roundup of technical Q&A's from the DVC community. This month, learn how to + clean your cache and use Git hooks with DVC. And here's an early holiday gift- + new Bitbucket support for CML! +picture: 2020-11-25/cover.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/november-20-community-gems/566 +tags: + - Community Gems + - CML + - Cache + - Bitbucket +--- + +## DVC questions + +### [Q: If I checkout a different Git branch, how do I synchronize with DVC?](https://discord.com/channels/485586884165107732/485596304961962003/773498570795778058) + +Here's what we recommend: when you checkout a different Git branch in your +project: + +```dvc +$ git checkout -b +``` + +you'll want to next run + +```dvc +$ dvc checkout +``` + +to synchronize your `.dvc` files on that branch. But _did you know_ you can +automate this with a `post-checkout` Git hook? We've got a hook that executes +`dvc checkout` whenever you run `git checkout`, so you'll always have the +correct data file versions. Head to our docs to +[read up on installing Git hooks into your DVC repository](https://dvc.org/doc/command-reference/install#install) +so you never forget to `dvc checkout`! + +### [Q: I have a big, 100 GB directory. I want to know where the contents are located so I can open them with Spark- is there a way to get the location of my files without caching them locally?](https://discord.com/channels/485586884165107732/485596304961962003/771386223403073587) + +For this, we'd recommend the +[DVC Python API](https://dvc.org/doc/api-reference/get_url#dvcapiget_url)'s +`get_url` function. For example, in a Python script you'd write: + +```python +import dvc.api + +resource_url = dvc.api.get_url( + "", + repo="https://github.com/") +) +``` + +This code means the API will return the URL for a file that ends in `.dir`. 
The
+`.dir` file contains a JSON-formatted table of the hashes and relative paths for
+all the files inside ``. You could then parse that file to
+get the relative paths to the files in your remote storage.
+
+The JSON object will look something like this, for a file `foo/bar` in your
+project:
+
+```json
+{ "md5": "abcd123", "relpath": "foo/bar" }
+```
+
+Then you can convert the relative path to `foo/bar` to an absolute path as
+follows:
+
+```dvc
+https:///ab/cd123
+```
+
+To better understand how DVC uses
+[content-addressable storage](https://en.wikipedia.org/wiki/Content-addressable_storage)
+in your remote,
+[read up in our docs](https://dvc.org/doc/user-guide/dvc-internals#structure-of-the-cache-directory).
+
+### [Q: Can I have more than one `dvc.yaml` file in my project?](https://discord.com/channels/485586884165107732/563406153334128681/777946398250893333)
+
+By default, DVC pipelines record all your stages (and their inputs and outputs)
+in a single file, `dvc.yaml`. Per directory, you can have one `dvc.yaml` file.
+If you want to run pipelines in a different folder than your project root, you
+could create another `dvc.yaml` in a subdirectory.
+
+However, `dvc.yaml` is intended to be the only file you need to record and
+reproduce pipelines per directory. Pipelines are designed to have all stages
+stored in the same place, and there's currently no method to rename `dvc.yaml`.
+
+### [Q: How can I untrack a file that's being tracked by DVC? I want to remove it from remote storage and my local cache, too.](https://discord.com/channels/485586884165107732/563406153334128681/773277514717462548)
+
+If you want to untrack a file, perhaps something you added to DVC in error, you
+can use `dvc remove` to get rid of the `.dvc` file corresponding to your file,
+and then clear your DVC cache with `dvc gc -w --cloud`. 
+[Check out our docs](https://dvc.org/doc/user-guide/how-to/stop-tracking-data)
+to learn more about `dvc gc` and what its flags mean (you'll want to be sure you
+know what you're doing, since cache cleaning deletes files permanently!).
+
+Alternatively, you can manually find and delete your files:
+
+1. Find the file using its hash from the corresponding `.dvc` file (or, if it's
+ part of a pipeline, the `dvc.lock` file).
+2. Look in your remote storage and remove the file matching the hash.
+3. Look in `.dvc/cache` and remove the file as well. If you'd like to better
+ understand how your cache is organized,
+ [we have docs for that](https://dvc.org/doc/user-guide/dvc-internals#structure-of-the-cache-directory).
+
+Your DVC remote storage and cache are simply storage locations, so once your
+file is gone from there it's gone for good.
+
+### [Q: My DVC cache is getting a bit big. Can I clean it?](https://discord.com/channels/485586884165107732/563406153334128681/771275051382341674)
+
+Definitely. Have you seen the command `dvc gc`? It helps you clean your local
+cache- [read up here](https://dvc.org/doc/command-reference/gc). This function
+lets you get granular about what you're keeping; for example, you can instruct
+`dvc gc` to preserve cache files that are currently used in your local workspace,
+tips of Git branches, tagged Git commits or all Git commits. Everything else
+will be removed.
+
+One word of caution: make sure that when you collect garbage from your cache,
+you don't delete any files that you haven't yet pushed to a remote. If this
+happens, you'll delete them permanently. To be safe, it never hurts to
+`dvc push` your files of interest before cleaning.
+
+## CML questions
+
+### [Q: Does CML support Bitbucket?](https://github.com/iterative/cml/issues/140)
+
+We've just rolled out Bitbucket Cloud support! 
There are brand new docs in the CML
+project repo,
+[so check them out](https://github.com/iterative/cml/wiki/CML-with-Bitbucket-Cloud)
+to get started. A few quick notes to keep in mind:
+
+1. Like GitLab, Bitbucket Cloud requires you to create a token for authorizing
+ CML to write comments. Make sure you don't forget this step (it's in the
+ docs!) or you'll surely hit a permissions error.
+
+2. Bitbucket Cloud uses Bitbucket Pipelines for continuous integration
+ workflows, which
+ [currently doesn't support self-hosted runners](https://jira.atlassian.com/browse/BCLOUD-16995).
+ That means
+ [bringing your own GPUs is not supported](https://community.atlassian.com/t5/Bitbucket-questions/Does-bitbucket-pipe-support-GPUs-yet/qaq-p/1042659).
+ Sorry! But you can still have all the other CML benefits of plots, tables and
+ text in your Pull Request.
+
+3. Bitbucket Server support (with Jenkins and Bamboo) is under active
+ development. Stay tuned!
+
+![](../uploads/images/2020-11-25/bitbucket_cloud_pr.png)_Now your Bitbucket PRs
+can be as pretty as you._
+
+### [Q: Can I use CML with Windows runners?](https://discord.com/channels/485586884165107732/728693131557732403/772519007894765600)
+
+While all our CML tutorials and docs use Ubuntu runners of various flavors,
+there's no problem with using Windows runners. Both
+[GitHub Actions](https://docs.github.com/en/free-pro-team@latest/actions/reference/specifications-for-github-hosted-runners)
+and
+[GitLab CI](https://about.gitlab.com/blog/2020/01/21/windows-shared-runner-beta/)
+have Windows runners up for grabs. And of course, you can set up your own
+Windows machine as a self-hosted runner (see the self-hosted runner docs for
+your CI system to learn more).
+
+What if you have a GPU? If you want to use
+[`nvidia-docker` to put GPU drivers in your container](https://dvc.org/blog/cml-self-hosted-runners-on-demand-with-gpus),
+you'll want to use `nvidia-docker` with the Windows Subsystem for Linux (WSL). 
+That means you'll first install an Ubuntu subsystem on your Windows machine,
+then all your Nvidia drivers, then Docker and `nvidia-docker`. Check out some
+[more docs about CUDA with WSL](https://docs.nvidia.com/cuda/wsl-user-guide/index.html)
+to learn more.
+
+### [Q: I'm using CML to deploy a self-hosted runner with GitLab. I noticed that in your docs, the runner is always set to timeout after 1800 seconds, and then it gets unregistered from GitLab. What if I want to keep my runner registered after the job ends?](https://discord.com/channels/485586884165107732/728693131557732403/779317571354099722)
+
+With CML, we introduced an approach using Docker Machine to provision instances
+in the cloud, and then use `docker run` to register them as self-hosted runners to
+complete your workflow. As this question points out, we like to set runners to
+timeout after 1800 seconds- that's why you'll see this code in our
+[sample "Cloud GPU" workflow](https://github.com/iterative/cml_cloud_case/blob/master/.github/workflows/cml.yaml):
+
+```dvc
+$ sudo docker run --name myrunner -d --gpus all \
+ -e RUNNER_IDLE_TIMEOUT=1800 \
+ -e RUNNER_LABELS=cml,gpu \
+ -e RUNNER_REPO=$CI_SERVER_URL \
+ -e repo_token=$REGISTRATION_TOKEN \
+ -e RUNNER_DRIVER=gitlab \
+ iterativeai/cml:0-dvc2-base1-gpu runner
+```
+
+We did this so you'll avoid running up GPU hours and a big bill. If you're not
+worried about that, though, you can set the environment variable
+`RUNNER_IDLE_TIMEOUT` in the `dvcorg/cml` container to 0. Then, your self-hosted
+runner will stay on forever, or at least until you manually turn it off.
+
+By the way... stay tuned for a big update here. We're currently replacing the
+Docker Machine approach with a method based on Terraform, and we can't wait to
+unveil it. It should make deploying cloud instances on AWS, GCP and Azure work
+with less code than ever.
+
+### Q: What did DeeVee do for Thanksgiving?
+
+She stayed home and made mashed potatoes. 
+ +![](../uploads/images/2020-11-25/deevee_n_taters.png) + +That's all for now, everyone! As always, keep in touch with all your questions +big and small. diff --git a/content/blogs/2020-11-26-dvc-vs-rclone.md b/content/blogs/2020-11-26-dvc-vs-rclone.md new file mode 100644 index 0000000000..043b9b08ab --- /dev/null +++ b/content/blogs/2020-11-26-dvc-vs-rclone.md @@ -0,0 +1,431 @@ +--- +title: 'Cloud Data Sync Methods and Benchmark: DVC vs Rclone' +date: 2020-11-26 +description: > + DVC 1.0 optimized data synchronization to and from remote storage. Here's how + we did it. +descriptionLong: > + Synchronizing data to and from remote storage requires addressing an often + overlooked performance bottleneck: Determining which files to upload and + download. Here we'll outline the general methods used to solve this problem, + and investigate each method's effects on performance by comparing benchmark + results from DVC and rclone. We'll then conclude with a more in-depth + explanation of the optimizations made in DVC 1.0 which enabled us to + outperform both older DVC releases as well as general data sync tools like + rclone. +commentsUrl: https://discuss.dvc.org/t/cloud-data-sync-methods-and-benchmark-dvc-vs-rclone/562 +tags: + - Rclone + - Performance + - Engineering + - Benchmark + - Tutorial +picture: 2020-11-26/header.png +author: peter_rowlands +--- + +Many general-use tools are available for synchronizing data to and from cloud +storage, some widely used options are [rsync](https://rsync.samba.org/), +[rclone](https://rclone.org/) and +[aws sync](https://docs.aws.amazon.com/cli/latest/reference/s3/sync.html), each +with their own advantages and disadvantages. Likewise, in [DVC](/) we provide +the ability to efficiently sync versioned datasets to and from cloud storage +through a git-like push and pull +[interface](https://dvc.org/doc/start/data-management/data-versioning). 
+ +Given that transferring data over a network to and from cloud storage is an +inherently slow operation, it's important for data sync tools to optimize +performance wherever possible. While the data transfer itself may be the most +apparent performance bottleneck in the data sync process, **here we'll cover a +less obvious performance issue: How to determine which files to upload and +download.** + +In this post, we'll outline the general methods used to solve this problem, and +investigate each method's effects on performance by comparing benchmark results +from DVC and rclone. We'll then conclude with a more in-depth explanation of new +optimizations made in DVC 1.0 which enabled us to outperform both older DVC +releases as well as general data sync tools (like rclone). + +_Note: "Cloud storage" and "remote storage" will be used interchangeably +throughout this post. When discussing dataset size in this post, we mean size in +terms of total number of files in a dataset, rather than the total amount of +file data (bytes)._ + +### Outline + +- [Why a "trivial" problem has a not-so-trivial performance impact](#why-a-trivial-problem-has-a-not-so-trivial-performance-impact) +- [Real-world numbers - DVC and rclone performance examples](#real-world-numbers---dvc-and-rclone-performance-examples) +- [How DVC 1.0 speeds things up](#how-dvc-10-speeds-things-up) +- [Conclusion](#conclusion) + +## Why a "trivial" problem has a not-so-trivial performance impact + +At the start of any data sync operation, we must first do the following steps, +in order to determine which files to upload and download between the local +machine and cloud storage: + +1. Determine which files are present locally. +2. Query the cloud storage API to determine which files are present in the + cloud. +3. Compute the difference between the two sets of files. 
+ +Once this difference in file status has been determined, the necessary files can +be copied to or from cloud storage as needed ("file status" meaning file +existence as well as other potential status information, such as modification +time). **While this may seem like a trivial problem, the second step is actually +a significant potential performance bottleneck.** + +In general, cloud storage APIs provide two possible ways to determine what files +are present in cloud storage, and it's up to the data sync tool to select which +method to use. Even for an operation as simple as synchronizing a single local +file to cloud storage, choosing incorrectly between these two options could +actually mean the difference between that "simple" operation taking several +hours to complete instead of just a few seconds. + +_Note: The term "file status query" will be used throughout this post when +referring to this type of cloud storage API query._ + +### Method 1: Query individual files + +The first query method is to individually check whether or not particular files +exist in cloud storage, one at a time. + +_Ex: The S3 API provides the `HeadObject` method.`_ + +When using this method, performance depends on the number of files being +queried - for a single file, it would take a single API request, for 1 million +files, it would take 1 million API requests. In this case, the overall amount of +time it will take to complete the full operation will scale with the number of +files to query. + +One particular advantage to using this method is that it can be easily +parallelized. Overall runtime can be improved by making simultaneous API +requests to query for multiple files at once. + +### Method 2: Query full remote listing + +The second query method is to request the full listing of files present in cloud +storage, all at once. 
+ +_Ex: The S3 API provides the `ListObjects` method._ + +With this method, the overall amount of time it will take to complete the full +operation scales with the total number of files in cloud storage, rather than +the number of files we wish to query. + +It's important to note that when using this method, cloud APIs will only return +a certain number of files at a time (the amount returned varies depending on the +API). This means that for an API which returns 1000 files at a time (such as +S3), retrieving the full listing of a remote containing 1000 files or less would +would only take a single API request. Listing a remote which contains 1 million +files would take 1000 API requests. + +Another important note is that API calls for this method must be made +sequentially and can't be easily parallelized. Using S3 as an example, the first +API call would return files 0 through 999. The next call would return files 1000 +through 1999, and so on. However, the API provides no guarantee of ordering, and +API calls must be made sequentially, until the full list has been retrieved. So +we can't make two simultaneous requests for both "files 1-999" and "files +1000-1999". + +### How selecting one method or the other can drastically improve performance + +Consider an example scenario where a dataset being synchronized contains 100 +local files, and we need to check which of those files exist in cloud storage. +For the purposes of this example, we'll also assume that all individual API +calls take the same amount of time to complete, and that we are not running any +tasks in parallel. Additionally, let's say that our example cloud storage API +returns 1000 files per page when using query method 2. + +In this situation, we know that the first query method will always take a fixed +number of API calls to complete (100). The number of API calls required for the +second query method depends on the total number of files that already exist in +the remote. 
+ +Since we know that the API returns 1000 results per API call, we can say that if +the remote contains less than `1000 * 100 = 100,000` files, fetching the full +remote listing (method 2) will be faster than checking each file individually, +since it will take less than 100 API calls to complete. In the case that the +remote contains 1000 or less files, method 2 would only require a single API +call (potentially outperforming method 1 by 100x). + +However, if the remote contains anything over this 100,000 threshold, method 1 +will be faster than method 2, with the difference in performance between the two +methods scaling linearly as the potential remote size increases. + +**Total API calls required to query 100 local files from S3** +![API calls](../uploads/images/2020-11-26/api_calls_100_local.svg 'API calls required to query 100 local files from S3') + +This example illustrates an important point. Given a (relatively) small set of +files to query and a sufficiently large remote, method 1 will always be faster +than method 2. + +Thinking about it from a different perspective, what happens if we have the +ability to reduce the size of a (relatively) large query set? + +Once our query set is smaller than a certain threshold, we'll be able to use +method 1 rather than method 2. On top of that, we know that the runtime of +method 1 scales with query set size. **In simple terms, by reducing the size of +our query set as much as possible, we can also improve performance.** + +So, as we have shown, choosing the optimal method depends on both: + +- The number of files that we need to query. +- The total number of files in the remote. 
+ +_Note: In terms of real world performance, there are other considerations that +DVC must account for, such as different API calls taking different amounts of +time to complete, parallelization, and the amount of time it takes to run list +comparison operations in Python._ + +## Real-world numbers - DVC and rclone performance examples + +Now let's take a look at some real-world numbers to examine the impact selecting +one query method or the other has on data sync performance in DVC and rclone. +Both tools can utilize either potential query method, with some differences: + +- In rclone, the user can specify the `--no-traverse` option to select the first + query method, otherwise rclone will default to the second method in most + situations (with the exception being cases with very small query set sizes). +- In DVC prior to 1.0, the first query method would be used by default for all + supported cloud storage platforms except Google Drive, and the user could + specify one method or the other via the `no_traverse` configuration option. +- **In DVC 1.0 and later, the optimal query method is selected automatically.** + +In the following scenarios, we are simulating the typical DVC use case in which +a user tracks a local directory containing some number of files using DVC, and +then synchronizes the DVC-tracked directory to cloud storage (S3 in these +examples) using either DVC or rclone. The user would then continually repeat a +process of: + +1. Modify a small subset of files in the directory. +2. Push the updated version of the directory into cloud storage. + +Keep in mind that for DVC's purposes, we are most interested in optimizing +performance for scenarios which are normally very slow to complete. If you +consider an operation which previously took several hours to complete, improving +that runtime down to a few minutes will have a much greater impact for our users +versus shaving a few seconds off of an operation which previously took under a +minute to run. 
+ +_Note: For these benchmarks we are only interested in the amount of time +required to determine file status for this one-way push operation. So the +runtimes in each case are for status queries only (using `dvc status -c` in DVC +and `rclone copy --dry-run` in rclone). No file data was transferred to or from +S3 in any of these scenarios._ + +_Benchmark command usage:_ + +```dvc +$ time dvc status -c -r remote +$ time rclone copy --dry-run --progress --exclude "**/**.unpacked/" .dvc/cache remote:... +``` + +_rclone run with `--no-traverse` where indicated_ + +_Benchmark platform: Python 3.7, macOS Catalina, DVC installed from pip, +dual-core 3.1GHz i7 cpu_ + +**Local directory w/100k total files, S3 bucket w/1M total files (1 file +modified since last sync)** +![benchmarks](../uploads/images/2020-11-26/dvc_rclone_bench.svg 'DVC 1.0 vs rclone performance comparison') + +The previous chart contains benchmarks for a scenario in which the local +directory contains 100,000 files, and the S3 bucket contains approximately 1 +million files. One file in the local directory has been modified since the +directory was last synchronized with the S3 bucket. This scenario tests the +length of time it takes DVC or rclone to determine (and report to the user) that +only the one modified file is missing from the S3 bucket and needs to be +uploaded. + +This illustrates DVC's performance advantage over rclone with regard to +synchronizing iterations of a versioned dataset over time, as well as the DVC +1.0 performance improvements over prior releases. + +_Note: In these examples, the local file count refers to the number of files +inside the original tracked directory. The number of files present in the DVC +cache will differ slightly, since the DVC cache will contain an additional file +representing the tracked directory itself, but the end result is that both DVC +and rclone will both need to query for the same number of files (i.e. 
the number +of files in the cache directory)._ + +**Local directory w/1 file, S3 bucket w/1M total files** +![benchmarks](../uploads/images/2020-11-26/dvc_rclone_bench2.svg 'DVC 1.0 vs rclone performance comparison') + +In this example, we are testing a simple scenario in which the local directory +contains 1 file and the S3 bucket contains approximately 1 million files. + +In this case, in DVC 0.91 we essentially get lucky that our default choice for +S3 happens to be the first query method. If we ran this same scenario with a +Google Drive remote (where the 0.91 default choice is the second query method) +instead of S3, we would see a very long runtime for DVC 0.91. + +Also note that here, rclone is able to determine that with a single local file +to query, it should use the first query method instead of defaulting to the +second method. + +_Note: We are unsure of the reason for the rclone runtime difference with and +without `--no-traverse` for this scenario, but rclone does do some computation +to determine whether or not to default to `no-traverse` behavior for small query +sets. It's likely that specifying `--no-traverse` allows rclone to skip that +overhead entirely in this case._ + +**Local directory w/1M files, Empty S3 bucket** +![benchmarks](../uploads/images/2020-11-26/dvc_rclone_bench3.svg 'DVC 1.0 vs rclone performance comparison') +_Note: DVC 0.91 and rclone with `--no-traverse` both take multiple hours to +complete in this scenario and continue off of the chart._ + +In this example, we are testing a simple scenario in which the local directory +contains approximately 1 million files and the S3 bucket is empty. + +The difference in rclone runtime with or without `--no-traverse` in this +scenario shows the performance impact of selecting the optimal query method for +a given situation. + +This scenario also shows that rclone can outperform DVC with regard to +collecting the list of local files during certain types of sync operations. 
In +this case, rclone simply iterates over whatever files exist in the local +directory without doing any additional steps, since our benchmark uses a one-way +`rclone copy` operation. + +However, in DVC, we have some extra overhead for this step, since we collect the +list of files expected to be present in the current DVC repository revision, and +then verify that those files are present locally. We would then check to see if +any missing files are available to be downloaded from remote storage. + +It should also be noted that in common use cases where the number of files in +cloud storage continues to grow over time (such as in backup solutions or in +dataset versioning), rclone's advantage in this case would only apply for this +initial sync operation. Once the local dataset has been pushed to cloud storage, +DVC's advantage in synchronizing modifications to existing datasets would become +more apparent (as shown in the first example). + +## How DVC 1.0 speeds things up + +So I hope that by now you're curious about DVC, and are planning on using (or +maybe even already are using 😀) it to sync your files. For those who are +wondering where the magic actually happens, let's dive a bit deeper into how DVC +stores files, and how we were able to leverage that storage format to implement +query performance optimzations in DVC 1.0. (This will also be a useful primer +for anyone interested in learning about DVC internals in general.) + +Previously, we have established that: + +- Selecting the right query method will have a significant performance impact. +- Reducing the number of files to query will improve performance. + +In this section, we'll cover the ways in which DVC 1.0 has directly addressed +both of these key points: + +- Automatically selecting the optimal query method for any given sync operation. +- Indexing cloud storage remotes to eliminate the need to query for already + synchronized files. 
+ +### DVC storage structure + +Before continuing, it will be helpful for the reader to understand a few things +about the DVC cache and remote storage structure. + +``` +. +├── 00 +│ ├── 411460f7c92d2124a67ea0f4cb5f85 +│ ├── 6f52e9102a8d3be2fe5614f42ba989 +│ └── ... +├── 01 +├── 02 +├── 03 +├── ... +└── ff +``` + +_Example DVC cache/remote structure_ + +- Files versioned by DVC are identified and stored in subdirectories according + to their [MD5](https://en.wikipedia.org/wiki/MD5) hash (i.e. + [content addressable storage](https://en.wikipedia.org/wiki/Content-addressable_storage)). +- MD5 is an + [evenly distributed](https://michiel.buddingh.eu/distribution-of-hash-values) + hash function, so the DVC cache (and DVC remote storage) will be evenly + distributed (i.e. given a large enough dataset, each remote subdirectory will + contain an approximately equal number of files) + +### How DVC 1.0 automatically selects a query method + +In DVC, the number of files we need to query is just the number of files for a +given project revision. So, as long as we can estimate the number of files in a +DVC remote, we can programmatically choose the optimal query method for a remote +operation. + +In DVC 1.0, we accomplish this by taking advantage of the DVC remote structure. +The over/under remote size threshold only depends on the number of files being +queried (i.e. the number of files in our DVC versioned dataset). And as we have +already established, a DVC remote will be evenly distributed. Therefore, if we +know the number of files contained in a subset of the remote, we can then +estimate the number of files contained in the entire remote. + +For example, if we know that the remote subdirectory `00/` contains 10 files, we +can estimate that the remote contains roughly `256 * 10 = 2,560` files in total. 
+So, by requesting a list of one subdirectory at a time (rather than the full +remote) via the cloud storage API, we can calculate a running estimate of the +total remote size. If the running estimated total size goes over the threshold +value, DVC will stop fetching the contains of the remote subdirectory, and +switch to querying each file in our dataset individually. If DVC reaches the end +of the subdirectory without the estimated size going over the threshold, it will +continue to fetch the full listing for the rest of the remote. + +By estimating remote size in DVC 1.0, we can ensure that we always use the +optimal method when querying remote status. + +### How DVC 1.0 uses indices to reduce the number of files to query + +A common DVC use case is +[versioning](https://dvc.org/doc/use-cases/versioning-data-and-model-files) the +contents of a large directory. As the contents of the directory changes over +time, DVC will be used to push each updated version of the directory into cloud +storage. In many cases, only a small number of files within that directory will +be modified between project iterations. + +So after the first version of a project is pushed into cloud storage, for +subsequent versions, only the small subset of changed files actually needs to be +synchronized with cloud storage. + +Consider a case where a user has an existing directory with 1 million files +which has been versioned and pushed to a remote with DVC. In the next iteration +of the project, only a single file in the directory has been modified. We can +obviously see that everything other than the one modified file will already +exist in cloud storage. Ideally, we should only need to query for the single +modified file. + +However, in DVC releases prior to 1.0, DVC would always need to query for every +file in the directory, regardless of whether or not a given file had changed +since the last time it was pushed to remote storage. 
+ +But in DVC 1.0, we now keep an index of directories which have already been +versioned and pushed into remote storage. By referencing this index, DVC will +"remember" which files already exist in a remote, and will remove them from our +query set at the start of a data sync operation (before we choose a query +method, and before we make any cloud storage API requests). + +_Note: This optimization only applies to DVC versioned directories. Individually +versioned files (including those added with `dvc add -R`) are not indexed in DVC +1.0, and will always be queried during remote operations._ + +## Conclusion + +By utilizing a storage structure that allows for optimized status queries, DVC +makes data synchronization incredibly fast. Coupled with the ability to quickly +identify which files remain unchanged between sync operations, DVC 1.0 is a +powerful data management tool. + +Whether you are upgrading from a prior DVC release, or trying DVC for the first +time, we hope that all of our users are able to benefit from these new +optimizations. DVC performance is an important issue, and our team is looking +forward to working on further +[performance optimizations](https://github.com/iterative/dvc/labels/performance) +in the future - across all areas in DVC, not just remotes. + +As always, if you have any questions, comments or suggestions regarding DVC +performance, please feel free to connect with the DVC community on +[Discourse](https://discuss.dvc.org/), [Discord](https://dvc.org/chat) and +[GitHub](https://github.com/iterative/dvc). 
diff --git a/content/blogs/2020-12-18-december-20-dvc-heartbeat.md b/content/blogs/2020-12-18-december-20-dvc-heartbeat.md new file mode 100644 index 0000000000..c4dd7c611d --- /dev/null +++ b/content/blogs/2020-12-18-december-20-dvc-heartbeat.md @@ -0,0 +1,170 @@ +--- +title: December ’20 Heartbeat +date: 2020-12-18 +description: > + Monthly updates are here- read all about our brand new video docs, the DVC + Udemy course, open jobs with our team, and essential reading about Git-flow + with DVC. +descriptionLong: > + Monthly updates are here- read all about our brand new video docs, the DVC + Udemy course, open jobs with our team, and essential reading about Git-flow + with DVC. +picture: 2020-12-18/cover.png +pictureComment: | + This holiday season, show your loved ones + you care with our new shirt. + +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/december-20-heartbeat/585 +tags: + - Heartbeat + - CML + - DVC + - Udemy + - MLOps +--- + +## News + +Welcome to the December Heartbeat! Let's dive in with some news from the team. + +### We're still hiring + +Our search continues for two roles: + +- A + [**Senior Software Engineer**](https://weworkremotely.com/remote-jobs/iterative-senior-software-engineer-open-source-dev-tools-3) + for the core DVC team- someone with strong Python development skills who can + build and ship essential DVC features. + +- A + [**Developer Advocate**](https://weworkremotely.com/remote-jobs/iterative-developer-advocate) + to support and inspire developers by creating new content like blogs, + tutorials, and videos- plus lead outreach through meetups and conferences. + +Does this sound like you or someone you know? Be in touch! + +### Video docs complete! + +As you may have heard +[last month](https://dvc.org/blog/november-20-dvc-heartbeat), we've been working +on adding complete video docs to the "Getting Started" section of the DVC site. +We now have 100% coverage! 
We have videos that mirror the tutorials for: + +- [Data versioning](https://dvc.org/doc/start/data-and-model-versioning) - how + to use Git and DVC together to track different versions of a dataset + +- [Data access](https://dvc.org/doc/start/data-and-model-access) - how to share + models and datasets across projects and environments + +- [Pipelines](https://dvc.org/doc/start/data-pipelines) - how to create + reproducible pipelines to transform datasets to features to models + +- [Experiments](https://dvc.org/doc/start/experiments) - how to do a `git diff` + for models that compares and visualizes metrics + +![Mission Accomplished GIF by memecandy](https://media.giphy.com/media/L4ZZNbDpOCfiX8uYSd/giphy.gif) + +The +[full playlist is on our YouTube channel](https://www.youtube.com/playlist?list=PL7WG7YrwYcnDb0qdPl9-KEStsL-3oaEjg)- +where, by the way, we've recently passed 2,000 subscribers! Thanks so much for +your support. There's much more coming up soon. + +### Collaboration with GitLab + +We recently released a new blog with GitLab all about using [CML](cml.dev) with +GitLab CI. + +https://twitter.com/gitlab/status/1334631001956487171 + +You may notice that the tweet spelled our name differently, and since Twitter +doesn't have an edit button, I think that means we're "Interative" now. +[Hurry up and get your merch!](https://www.zazzle.com/t_shirt-235920696568133954) + +![](../uploads/images/2020-12-18/newname.png) + +### Workshops + +We gave a workshop at a virtual meetup held by the +[Toronto Machine Learning Society](https://mlopsworld.com/about-us/), and you +can catch a video recording if you missed it. This workshop was all about +getting started with GitHub Actions and CML! It starts with some high-level +overview and then gets into live-coding. 
+ +https://youtu.be/51H13lfHdMw + +## From the community + +There's no shortage of cool things to report from the community: + +### The DVC Udemy Course + +Now you can learn the fundamentals of machine learning engineering, from +experiment tracking to data management to continuous integration, with DVC and +Udemy! Data scientists/DVC ambassadors +[Mikhail Rozhkov](https://www.udemy.com/user/mnrozhkov/) and +[Marcel Ribeiro-Dantas](https://www.udemy.com/user/marcel-da-camara-ribeiro-dantas/) +created a course full of +[practical tips and tricks for learners of all levels](https://www.udemy.com/course/machine-learning-experiments-and-engineering-with-dvc/?referralCode=68BEB2A7E246A54E5E35). + + + +### A proposal for Git-flow with DVC + +[Fabian Rabe](https://www.uni-augsburg.de/en/fakultaet/fai/informatik/prof/swtpvs/team/fabian-rabe/) +at [Universität Augsburg](https://www.uni-augsburg.de/en/) wrote a killer doc +about his team's tried-and-true approach to creating a workflow for a DVC +project. He writes, + +> Over the past couple of months we have started using DVC in our small team. +> With a handful of developers all coding, training models & committing in the +> same repository, we soon realized the need for a workflow. + +The post outlines three strategies his team adopted: + +1. Create a "debugging dataset" containing a subset of your data, with which you + can test your complete DVC pipeline locally on a developer's machine + +2. Use CI-Runners to execute the DVC pipeline on the full dataset + +3. Adopt a naming convention for Git branches that correspond to machine + learning experiments, in addition to the usual feature branches + +Agree? Disagree? Fabian is actively soliciting feedback on his proposal (and +possible solutions for some unresolved issues), so please read and +[chime in on our discussion board](https://discuss.dvc.org/t/git-flow-for-dvc/578/6). 
+ + + +### Channel 9 talks Machine Learning and Python + +[The AI Show on Channel 9](https://channel9.msdn.com/Shows/AI-Show), part of the +Microsoft DevRel universe, put out an episode all about ML and scientific +computing with Python featuring [Tania Allard](https://twitter.com/ixek) and +[Seth Juarez](https://twitter.com/sethjuarez). Their episode includes how DVC +can fit in this development toolkit, so check it out! + + + +### A nice tweet + +We'll end on a tweet we love: + +https://twitter.com/iamjoyheron/status/1336698583689596929 + +This beautiful diagram, made by [Joy Heron](https://twitter.com/iamjoyheron) in +response to a talk by [Dr. Larysa Visengeriyeva](https://twitter.com/visenger) +about MLOps, is a wonderful encapsulation of the many considerations (at many +scales) that go into ML engineering. Do you see DVC in there? 🕵️ + +Thank you for reading, and happy holidays to you! ❄️ 🎁 ☃️ diff --git a/content/blogs/2020-12-30-december-20-community-gems.md b/content/blogs/2020-12-30-december-20-community-gems.md new file mode 100644 index 0000000000..7a34f5c178 --- /dev/null +++ b/content/blogs/2020-12-30-december-20-community-gems.md @@ -0,0 +1,196 @@ +--- +title: December '20 Community Gems +date: 2020-12-30 +description: > + A roundup of technical Q&A's from the DVC community. This month, read about + custom DVC plots, teamwork with DVC, CML without Docker, and maintaining + several pipelines in parallel! +descriptionLong: > + A roundup of technical Q&A's from the DVC community. This month, read about + custom DVC plots, teamwork with DVC, CML without Docker, and maintaining + several pipelines in parallel! 
+picture: 2020-12-30/cover.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/december-20-gems/606 +tags: + - Community Gems + - CML + - Plots + - Pipelines + - Docker +--- + +## DVC questions + +### [Q: Is there a way to plot all columns in a `.csv` file on a single graph using `dvc plot`?](https://discord.com/channels/485586884165107732/563406153334128681/768689062314770442) + +By default, `dvc plot` graphs one or two columns from the metric file of your +choice (use the `-x` and `-y` flags to specify which columns). + +However, there's nothing special about the way DVC makes plots. The plot +function is a wrapper for the [Vega-Lite](https://vega.github.io/vega-lite-v1/) +grammar, which can make pretty much any kind of plot you can imagine. If you +check inside `.dvc/plots/`, you'll see a few Vega-Lite template files- that's +where the plotting instructions are stored! + +You can create your own, or modify the existing templates, by +[following the instructions in our docs](https://dvc.org/doc/command-reference/plots#plot-templates). +In short, you'll create a new template and then run +`dvc plot show -t ` to use it! + +Vega-Lite has an +[interactive template editor online](https://vega.github.io/editor/#/), which +might help you test out ideas. Happy creating, and if you come up with a +template you'd like to share with the DVC community, +[consider opening a pull request!](https://github.com/iterative/dvc) + +### [Q: My teammate and I are having some issues keeping our workplaces synced. We're tracking some folders with DVC, and he recently added a new file to each of these folders. How does he update the tracked folder and push the new contents so I can access them, too?](https://discord.com/channels/485586884165107732/563406153334128681/785965719367843860) + +Your partner should first run + +```dvc +$ dvc add +$ dvc push +``` + +to update DVC about the new file and then push its contents to remote storage. 
+Next, they'll run: + +```dvc +$ git commit .dvc +$ git push +``` + +to update your shared Git repository. Then you can do a `git pull` and +`dvc pull` to sync the changes with your local workspace! + +### [Q: I forgot to declare a metric output in my `dvc.yaml` file, so one of my metrics is currently untracked. How can I fix this without rerunning the stage? It takes a long time to run.](https://discord.com/channels/485586884165107732/485596304961962003/781643749050155009) + +No problem- what you'll want to do is edit your `dvc.yaml` case and then run +`dvc commit dvc.yaml` to store the change. + +`dvc commit` is a helpful function that updates your `dvc.lock` file and `.dvc` +files as needed, which forces DVC to accept any modifications to tracked data +currently in your workspace. That should cover the case where you have a metric +file from your last pipeline run in your workspace, but forgot to add it to the +`dvc.yaml` as an output! + +[Check out the docs](https://dvc.org/doc/command-reference/commit#commit) for +more about `dvc commit` and how it can help you edit pipeline dependencies as +you work. + +### [Q: Can I have multiple `dvc.yaml` files?](https://discord.com/channels/485586884165107732/485596304961962003/784083794583486496) + +Yes. The catch is that they have to be in separate directories. For example, you +can define independent pipelines in a `dvc.yaml` file each. It's also possible +to spread a single pipeline into more than one `dvc.yaml` file. DVC analyzes all +of them to rebuild the DAG(s), for example during `dvc repro`. + +### [Q: I want to work on my DVC pipeline on a different computer than usual. 
For the stage I'm developing, I don't need access to all the data dependencies of the earlier stages- is there a way to download only what I need?](https://discord.com/channels/485586884165107732/563406153334128681/788068487246512158) + +Say for example that you have a pipeline like this: + +``` ++----------+ +| data.dvc | ++----------+ + * + * + * + +----+ + | s1 | + +----+ + * + * + * + +----+ + | s2 | + +----+ + * + * + * + +----+ + | s3 | + +----+ +``` + +where stage `s2` is frozen (meaning, its dependencies will not change and we can +be reasonably sure the outputs of `s2` are static). + +To work on stage `s3` in a new workspace, you could run: + +```dvc +$ dvc pull s2 +$ dvc repro s3 +``` + +This set of commands will pull only the targeted stage (not the data +corresponding to `data.dvc`), and then execute the final stage of your pipeline +only. + +## CML questions + +### [Q: Why do you need Docker to run CML?](https://www.youtube.com/watch?v=rVq-SCNyxVc&lc=UgzohiMVxO1GKB30bad4AaABAg) + +Even though we use Docker in many of our tutorials, you technically _don't_ need +it at all! Here's what's going on: + +We use a custom Docker container that comes with the CML functions installed (as +well as some useful data science tools like Python, Vega-Lite, and CUDA +drivers). If you want to use your own Docker container, that's fine too- just +make sure you install the CML library of functions on your runner. + +To install CML as an `npm` package on your runner, we recommend: + +```dvc +npm i -g @dvcorg/cml +``` + +Once this is done, you should be able to execute functions like `cml publish` +and `cml send-comment` on your runner. + +For more tips about using CML without Docker, +[see our docs](https://github.com/iterative/cml#install-cml-as-a-package). + +### [Q: I'm using CML to print a `dvc metrics diff` to my pull request in GitHub, but I'm getting an error: `token not found`. 
What does that mean?](https://discord.com/channels/485586884165107732/728693131557732403/786382971706933258) + +Generally, `token` refers to an authorization token that grants your runner +certain permissions with the GitHub API- such as the ability to post a comment +on your pull request. If you're working in GitHub, you don't have to follow any +manual steps to create a token. But you _do_ need to make sure your +environmental variables in the workflow are named properly. + +Make sure you've specified the following field in your workflow file: + +```yaml +env: + repo_token: ${{ secrets.GITHUB_TOKEN }} +``` + +The variable must be called `repo_token` for CML to recognize it! + +A few other pointers: + +- In GitLab, you have to set a variable in your repository called `repo_token` + whose value is Personal Access token. We have + [step-by-step instructions in our docs](https://github.com/iterative/cml/wiki/CML-with-GitLab#variables). + Forgetting to set this is the #1 issue we see with first-time GitLab CI users! +- In BitBucket Cloud, you need to set a variable in your repository called + `repo_token` whose value is your API credentials. We have + [detailed docs for creating this token](https://github.com/iterative/cml/wiki/CML-with-Bitbucket-Cloud#repository-variables), + too. +- Need to see more sample workflows to get a feel for it? We have plenty + [of case studies](https://dvc.org/doc/cml#case-studies) to examine. + +### [Q: Is there any reason why an experimental DVC feature wouldn't work on the CML Docker container?](https://discord.com/channels/485586884165107732/728693131557732403/788512890394247178) + +Generally, no- the container `dvcorg/cml:latest` should have the latest DVC +release and the latest CML release (you can see where DVC and CML are installed +from in our +[Dockerfile](https://github.com/iterative/cml/blob/master/Dockerfile)). 
So +besides the time it takes for releases to be published on various package +managers, there shouldn't be any lag. That means experimental features are ready +to play on your runner! + +Note that you can also install pre-release versions of DVC- check out our +[docs about installing the latest stable version ahead of official releases](https://dvc.org/doc/install/pre-release). diff --git a/content/blogs/2021-01-20-january-21-dvc-heartbeat.md b/content/blogs/2021-01-20-january-21-dvc-heartbeat.md new file mode 100644 index 0000000000..5267801455 --- /dev/null +++ b/content/blogs/2021-01-20-january-21-dvc-heartbeat.md @@ -0,0 +1,162 @@ +--- +title: January ’21 Heartbeat +date: 2021-01-20 +description: > + Monthly updates are here! read all about our new R language tutorial, putting + DVC to work on an image segmentation pipeline, and a new fast way to setup + your DVC remote. +descriptionLong: > + Monthly updates are here! read all about our new R language tutorial, putting + DVC to work on an image segmentation pipeline, and a new fast way to setup + your DVC remote. +picture: 2021-01-20/cover.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/dvc-heartbeat-jan-21/632 +tags: + - Heartbeat + - CML + - DVC + - DAGsHub + - R + - MLOps +--- + +## News + +Welcome to the first Heartbeat of 2021! Here's some new year news. + +### We're still hiring + +Our search continues for a +[**Developer Advocate**](https://weworkremotely.com/remote-jobs/iterative-developer-advocate) +to support and inspire developers by creating new content like blogs, tutorials, +and videos- plus lead outreach through meetups and conferences. + +Does this sound like you or someone you know? Be in touch! + +### 7000 stars on GitHub + +We recently passed 7000 stars on the +[DVC GitHub repository](https://github.com/iterative/dvc)! We crossed the 7k +mark extremely close to midnight on New Year's Eve, so we probably hit it in +time for the new year in at least one time zone. 
Anyway, it made for a very +suspenseful countdown to midnight. Woot woot! + +![Make Countdown GIF](https://media.giphy.com/media/QAPFLCrpfalPi/giphy.gif) + +The repo is HQ for DVC development, meaning- if you have an issue to report, a +feature to request, or a pull request to offer, this is where you should start! + +### New video for R users + +A lot of our videos about GitHub Actions have used Python scripts, but there's +no reason to restrict [Continuous Machine Learning](https://cml.dev) to one +language. We've just released our first-ever R language video, which covers + +- How to install R on a GitHub Actions runner +- How to manage R package dependencies for continuous integration (teaser: CRAN + binaries are amazing) +- Putting a `ggplot` or a `kable` table in your pull request + +Watch and follow along! If you make something based on this approach, or if you +think there's a better way, please tell us- we're eager to see what the R +community thinks. + +https://youtu.be/NwUijrm2U2w + +### Workshops and talks + +On Friday, January 24, I (Elle) spoke with +[Alexey Grigorev](https://twitter.com/Al_Grigor) (author of a +[Data Science Bookcamp](https://mlbookcamp.com/)), on his podcast about being a +developer advocate in the machine learning space! If you're curious about what +the role entails, or what to look for when hiring a developer advocate for your +machine learning project, please come by. The event is up on YouTube, and will +soon be available as a podcast for your listening pleasure 🎧 + +https://youtu.be/jv5W4jXk4P4 + +## From the community + +As ever, we have much to share from the great citizens of the DVC community. + +### Where's Baby Yoda? + +There's a brand new blog post we love, and only half of that has to do with its +impressive collection of Baby Yoda pics. 
+[Simon Lousky](https://dagshub.com/blog/author/simon/), developer at +[DAGsHub](https://dagshub.com), published a blog provocatively titled +[_Datasets should behave like git repositories_](https://dagshub.com/blog/datasets-should-behave-like-git-repositories/). +He writes: + +> While data versioning solves the problem of managing data in the context of +> your machine learning project, it brings with it a new approach to managing +> datasets. This approach, also described as data registries here, consists of +> creating a git repository entirely dedicated to managing a dataset. This means +> that instead of training models on frozen datasets - something researchers, +> students, kagglers, and open source machine learning contributors often do - +> you could link your project to a dataset (or to any file for that matter), and +> treat it as a dependency. After all, data can and should be treated as code, +> and follow through a review process. + +We agree! Lousky goes on to show us a brilliant code example wherein he segments +instances of Baby Yoda out of frames from The Mandalorian. DVC plays a key role +in keeping track of all the Baby Yodas, which is pretty much the most important +use case we could've imagined. + +![](../uploads/images/2021-01-20/bb_yoda.png)_Found them!_ + +There's also a +[lively discussion about the post on Reddit](https://www.reddit.com/r/MachineLearning/comments/l0l0oc/p_datasets_should_behave_like_git_repositories/). +Check it out and consider contributing your own Baby Yoda image annotations to +grow the dataset! + +### Data Version Control Explained + +Researcher [Nimra Ejaz](https://blog.crowdbotics.com/author/nimra/) published a +fantastically detailed introduction to DVC. She even included a "History of DVC" +section, which is pretty cool for us- this might be a first! + +Her blog covers not only the key features of DVC, but a thoughtful pros-and-cons +list _and_ a case study about using DVC in an image classification project. 
If +you want an up-to-date, high-level overview of DVC and some help deciding if it +fits your needs, I couldn't recommend Nimra's blog more. + + + +### One more thing from DAGsHub + +[Dean Pleban](https://twitter.com/DeanPlbn), CEO of DAGsHub, shared an important +update: they now offer FREE dataset and model hosting for DVC projects (up to 10 +GB per user and project, with flexibility for public projects)! And with no +configuration! + +That means you don't have to configure your DVC remote to use DVC with model and +data storage in the cloud- DAGsHub will handle _all_ of it. Your DVC remote can +be added as easily as a Git remote, in other words. Read the announcement, and +then dig into their +[basic tutorial](https://dagshub.com/docs/experiment-tutorial/overview/) to get +started. + + + +### A nice tweet + +[Bilgin Ibryam](https://twitter.com/bibryam), author of the +[Kubernetes Patterns](https://www.redhat.com/en/engage/kubernetes-containers-architecture-s-201910240918) +book, gave us a shoutout for being an interesting data engineering project +(according to a list by another expert we trust, +[Dmitry Ryabov](https://twitter.com/squarecog)). Thanks Bilgin and Dmitry, we +think you're very interesting too! + +https://twitter.com/bibryam/status/1341777034448650242 diff --git a/content/blogs/2021-01-26-january-21-community-gems.md b/content/blogs/2021-01-26-january-21-community-gems.md new file mode 100644 index 0000000000..5da60d717b --- /dev/null +++ b/content/blogs/2021-01-26-january-21-community-gems.md @@ -0,0 +1,180 @@ +--- +title: January '21 Community Gems +date: 2021-01-26 +description: > + A roundup of technical Q&A's from the DVC community. This month: parallelize + your data transfer, compressed datasets, and DVC pipelines in CI/CD. +descriptionLong: > + A roundup of technical Q&A's from the DVC community. This month: parallelize + your data transfer, compressed datasets, and DVC pipelines in CI/CD. 
+picture: 2021-01-28/gems-cover.png
+author: elle_obrien
+commentsUrl: https://discuss.dvc.org/t/january-21-community-gems/645
+tags:
+  - Community Gems
+  - CML
+  - Plots
+  - Pipelines
+  - Docker
+---
+
+## DVC questions
+
+### [Q: Is there an equivalent of `git restore <filename>` for DVC?](https://discord.com/channels/485586884165107732/563406153334128681/799598181310267392)
+
+Yes! You'll want `dvc checkout`. It restores the corresponding version of your
+DVC-tracked file or directory from
+[the cache](https://dvc.org/doc/user-guide/dvc-internals#structure-of-the-cache-directory)
+to your local workspace.
+[Read up in our docs for more info!](https://dvc.org/doc/command-reference/checkout#checkout)
+
+### [Q: My dataset is made of more than _a million_ small files. Can I use an archive format, like `tar.gz` with DVC?](https://discord.com/channels/485586884165107732/485596304961962003/798983422965841920)
+
+There are some downsides to using archive formats, and often we discourage it-
+but let's review some factors to consider, so you can make the best choice for
+your project.
+
+- If your `tar.gz` file changes at all- perhaps because you changed a single
+  file before zipping- you'll end up with an entirely new copy of the archive
+  every time you commit! This is not very space efficient, but if space isn't an
+  issue it might not be a dealbreaker.
+- Because of the way we optimize data transfer, you'll end up transferring the
+  whole archive anytime you modify a single file and `dvc push`/`dvc pull`.
+- In general, archives don't play nice with the concept of diffs. Looking back
+  at your git history, it can be challenging to log how files were deleted,
+  modified, or added when you're versioning archives.
+
+While we can't do much about the general issues that archives present for
+version control systems, DVC does have some options that might help you achieve
+better data transfer speeds.
We recommend exploring DVC's built-in parallelism-
+data transfer functions like `dvc push` and `dvc pull` have a flag (`-j`) for
+increasing the number of jobs run simultaneously.
+[Check out the docs for more details](https://dvc.org/doc/command-reference/push#options).
+
+In summary, the advantage of using an archive format will depend on both how
+often you modify your dataset and how often you need to push and pull data. You
+might consider exploring both approaches (with and without compression) and run
+some speed tests for your use case. We'd love to know what you find!
+
+### [Q: My DVC remote is a server with a self-signed certificate. When I push data, DVC is giving me an SSL verification error- how can I get around this?](https://discord.com/channels/485586884165107732/563406153334128681/800707271502856222)
+
+On S3 or S3-compatible storage, you can configure your AWS CLI to use a custom
+certificate path.
+[As suggested by their docs](https://docs.aws.amazon.com/credref/latest/refdocs/setting-global-ca_bundle.html),
+you can also set the environment variable `AWS_CA_BUNDLE` to your `.pem` file.
+
+Similarly, on HTTP and Webdav remotes, there's `REQUESTS_CA_BUNDLE` environment
+variable that you can set your self-signed certificate file to.
+
+Then, when DVC tries to access your storage, you should be able to get past SSL
+verification!
+
+### [Q: I want to be able to make my own plots in Python with data points from my `dvc plots`, including older versions of those plots. What do you recommend to get the raw historical data?](https://discord.com/channels/485586884165107732/563406153334128681/799617584336338954)
+
+We suggest
+
+```python
+from dvc.repo import Repo
+
+revs = Repo().plots.collect(revs=revs)
+```
+
+Then you can plot the data contained in `revs` to your heart's content!
+
+### [Q: Is it safe to share a DVC remote between two projects or registries?](https://discord.com/channels/485586884165107732/563406153334128681/799216349405904896)
+
+You can share a remote with as many projects as you like. Because DVC uses
+content-addressable storage, you'll still get benefits like file deduplication
+over every project that uses the remote. This can be useful if you're likely to
+have many shared files across projects.
+
+One big thing to watch out for: you have to be very careful with clearing the
+DVC cache. Make sure you don't remove files associated with another project when
+running `dvc gc` by using the `--projects` flag.
+[Read up in the docs!](https://dvc.org/doc/command-reference/gc#options)
+
+### [Q: Can I throttle the number of simultaneous uploads to remote storage with DVC?](https://discord.com/channels/485586884165107732/563406153334128681/802099863076208662)
+
+Yep! That'll be the `-j/--jobs` flag, for example:
+
+```dvc
+$ dvc push -j <number>
+```
+
+will control the number of simultaneous uploads DVC attempts when pushing files
+to your remote storage
+([see more in our docs](https://dvc.org/doc/command-reference/push#push)).
+
+## CML questions
+
+### [Q: I have a DVC pipeline that I want to run in CI/CD. Specifically, I only want to reproduce the stages that have changed since my last commit. What do I do?](https://discord.com/channels/485586884165107732/728693131557732403/796185815574511616)
+
+DVC pipelines, like makefiles, will only reproduce stages that DVC detects have
+changed since the last commit. So to do this in CI/CD systems like GitHub
+Actions or GitLab CI, you'll want to make sure the workflow a) syncs the runner
+with the latest version of your pipeline, including all inputs and dependencies,
+and b) reruns your DVC pipeline.
+
+In practice, your workflow needs to include these two commands:
+
+```dvc
+$ dvc pull
+$ dvc repro
+```
+
+You pull the latest version of your pipeline, inputs and dependencies from cloud
+storage with `dvc pull`, and then `dvc repro` intelligently reproduces the
+pipeline (meaning, it should avoid rerunning stages that haven't changed since
+the last commit).
+
+Check out an
+[example workflow here](https://github.com/iterative/cml_dvc_case/blob/master/.github/workflows/cml.yaml).
+
+### [Q: I'm using DVC and CML to pull data from cloud storage, then train a model. I want to push the trained model into cloud storage when I'm done, what should I do?](https://discord.com/channels/485586884165107732/728693131557732403/801553810618187796)
+
+One approach is to add
+
+```dvc
+$ dvc add <model>
+$ dvc push
+```
+
+to the end of your workflow. This will push the model file, but there's a
+downside: it won't keep a strong link between the pipeline (meaning, the command
+you used to generate the model and any code/data dependencies) and the model
+file.
+
+What we recommend is that you create a
+[DVC pipeline](https://dvc.org/doc/start/data-pipelines#get-started-data-pipelines)
+with one stage- training your model- and declaring your model file as an output.
+Then, your workflow can look like this:
+
+```dvc
+# get data
+$ dvc pull --run-cache
+
+# run the pipeline
+$ dvc repro
+
+# push to remote storage
+$ dvc push --run-cache
+```
+
+When you do this workflow with the `--run-cache` flags, you'll be able to save
+all the results of the pipeline in the cloud
+([read more here](https://dvc.org/doc/command-reference/push#options)). When the
+run has completed, you can go to your local workspace and run:
+
+```dvc
+$ dvc pull --run-cache
+$ dvc repro
+```
+
+This will put your model in your local workspace! And, you get an immutable link
+between the code version, data version and model you end up with.
+
+We recommend this approach so you don't lose track of how model files relate to
+the data and code that produced them. It's a little more work to set up, but
+Future You will thank you!
+
+![Tim Robinson Reaction GIF by The Lonely Island](https://media.giphy.com/media/l0LEIXSRRuv9QQIRNI/giphy.gif)
diff --git a/content/blogs/2021-02-16-february-21-dvc-heartbeat.md b/content/blogs/2021-02-16-february-21-dvc-heartbeat.md
new file mode 100644
index 0000000000..295165733c
--- /dev/null
+++ b/content/blogs/2021-02-16-february-21-dvc-heartbeat.md
@@ -0,0 +1,177 @@
+---
+title: February ’21 Heartbeat
+date: 2021-02-16
+description: >
+  Monthly updates are here! Read all about our growing team, our CEO's interview
+  on The New Stack, integration with spaCy and more!
+descriptionLong: >
+  Monthly updates are here! Read all about our growing team, our CEO's interview
+  on The New Stack, integration with spaCy and more!
+picture: 2021-02-16/feb21cover.png
+author: jeny_defigueiredo
+commentsUrl: https://discuss.dvc.org/t/february-21-heartbeat/669
+tags:
+  - Heartbeat
+  - CML
+  - DVC
+  - DAGsHub
+  - spaCy
+  - ML Summit 2021
+  - Spell
+  - MLOps
+---
+
+## News
+
+Happy February! Here's all the news to keep you up to date.
+
+## We've hired and are still hiring!
+
+We have four new team members this month!
+
+[**Dave Berenbaum**](https://www.linkedin.com/in/david-berenbaum-20b6b424/) came
+to Iterative.ai by way of a
+[previous contribution](https://github.com/iterative/dvc/pull/2107) to our open
+source products while working as a Data Science Manager at Capital One. He joins
+the team as a Technical Product Manager. We are thrilled he's here!
+
+[**Batuhan Taskaya**](https://www.linkedin.com/in/batuhan-osman-taskaya-7803b61a0/)
+joins us as a DVC Software Engineer working on the Python core. Batuhan is
+excited to work on open source full time and we are excited to have him do so!
+ +[**Jeny De Figueiredo**](https://www.linkedin.com/in/jenifer-de-figueiredo/) is +involved in the Seattle area data science community at Data Circles and is a +WiDS Puget Sound Ambassador. She joins us as our new Community Manager and is +looking forward to further building and engaging the community in MLOps! (Hi! +This is me. 🙋🏻‍♀️ I'll be writing Heartbeat!) + +[**Roger Parent**](https://www.linkedin.com/in/rogermparent/) has already been a +big part of building DVC and [CML](https://cml.dev/). He has been a primary +developer of a UI that interfaces with the DVC Python application to provide an +interface with the Experiments feature that's coming out with DVC 2.0. We are so +excited to have him joining us full time as Software Engineer. + +![Search](https://media.giphy.com/media/vAvWgk3NCFXTa/giphy.gif) + +## Open Positions + +We are on the hunt for a +[TypeScript Front-End Engineer](https://docs.google.com/document/d/1aT5HZYt4kAUxXqD4JNTe3jPDlVUwSmnEWDPR2QoKdvo/edit) +to build SaaS and a VS Code UI for our popular machine learning tools: DVC and +CML. The ML tools ecosystem is what JS space was 10 years ago. Come join us on +this exciting project! + +Our search continues for a +[Developer Advocate](https://weworkremotely.com/remote-jobs/iterative-developer-advocate) +to support and inspire developers by creating new content like blogs, tutorials, +and videos - plus lead outreach through meetups and conferences. + +Does this sound like you or someone you know? Be in touch! + +## Iterative.ai Featured on The New Stack + +[Susan Hall](https://thenewstack.io/author/susanhall/) of +[The New Stack.io](https://thenewstack.io/) interviewed our very own CEO, +[Dmitry Petrov](https://twitter.com/fullstackml), discussing the needs of ML +engineers and how Iterative.ai makes tools to enable version control and CI/CD +for versioning data and ML models. + +> "ML engineers, they still need collaboration. 
They need GitHub for +> collaboration, they need this CI/CD system to resolve [issues] between each +> other, between the team and productions system." - Dmitry Petrov + + + +## Workshops and Talks + +### Developer Advocacy for Data Science + +So you saw the post further up. 👆🏽 Curious about developer advocacy or what to +look for in a hire for this position? +[Elle O'Brien](https://twitter.com/drelleobrien) dove into this recently with +[Alexey Grigorev](https://twitter.com/Al_Grigor) (author of a +[Data Science Bookcamp](https://mlbookcamp.com/)) +[in this podcast](https://www.youtube.com/watch?v=jv5W4jXk4P4) on +[DataTalks.club](http://datatalks.club/) You can watch it here below. 👇🏼 + +https://www.youtube.com/watch?v=jv5W4jXk4P4 + +## From the Community + +As ever, we have much to share from the great citizens of the DVC community. + +### spaCy and DVC Integration + +If your NLP team uses spaCy to manage your projects, with spaCy's release of +v3.0, you can now enjoy DVC integration to manage your workflow like Git! Check +out the [documentation here](https://spacy.io/usage/projects#integrations) to +streamline and track your process! 🏆 + + + +### DagsHub and DVC Integrations + +This month two great articles came out regarding the integration of DAGsHub and +DVC. First, this article: [Datasets Should Behave Like Git Repo walks you +through the steps to use DVC in your data versioning. The following image shows +the dependencies and how you simply need to do a `dvc update` each time your +dataset or model changes to track the process. + + + +### Did you say "Works Out of the Box?" + +Also from DAGsHub, by CEO [Dean Pleban](https://twitter.com/DeanPlbn), +[Free Dataset & Model Hosting with Zero Configuration - Launching DAGsHub Storage](https://dagshub.com/blog/dagshub-storage-zero-configuration-dataset-model-hosting/) +tells how their new DAGsHub storage is a DVC remote that requires zero +configuration (!) 
and will allow for team and organization access controls as +well as easy visibility. + +![Friends](https://media.giphy.com/media/Ftz07proVX6Rq/giphy.gif) + +### Model Management and ML Workflow Orchestration with DVC and Apache Airflow 🇩🇪 ❗️ + +We're really excited about a German language workshop led by +[Matthias Niehoff](https://twitter.com/matthiasniehoff)! The workshop will be a +part of the ML Summit 2021 taking place April 19-21st, but registration closes +February 18th. So time is ticking. ⏰ The Conference is online, but will be in +German. For more info, head here 👉🏽 for the +[Workshop Details](https://ml-summit.de/machine-learing/modellmanagement-und-ml-workflow-orchestrierung-mit-dvc-und-apache-airflow/). + +### "_The_ most popular 'N+1' tool used by teams on Spell" + +[Using DVC as a Lightweight Feature Store on Spell](https://spell.ml/blog/using-dvc-with-spell-YBHOChEAACgAaSmV) +by [Aleksey Bilogur](https://twitter.com/ResidentMario) , reviews the process of +using DVC with Spell for managing changing datasets, enabling team-wide data +reproducibility and why Spell fans are DVC fans, and vice versa. 🔄 + +![Fans](https://media.giphy.com/media/GM8PrUsm92hRC/giphy.gif) + +## Tweet Love ❤️ + +https://twitter.com/mihail_eric/status/1357014486377324547?s=20 + +You're all caught up! See you at the next Community Gems 💎! + +--- + +_Do you have any use case questions or need support? Join us in +[Discord](https://discord.com/invite/dvwXA2N)!_ + +_Head to the [DVC Forum](https://discuss.dvc.org/) to discuss your ideas and +best practices._ diff --git a/content/blogs/2021-02-18-dvc-2-0-pre-release.md b/content/blogs/2021-02-18-dvc-2-0-pre-release.md new file mode 100644 index 0000000000..2752f229f1 --- /dev/null +++ b/content/blogs/2021-02-18-dvc-2-0-pre-release.md @@ -0,0 +1,574 @@ +--- +title: DVC 2.0 Pre-Release +date: 2021-02-17 +description: > + Today, we're announcing DVC 2.0 pre-release. 
We'll share lessons from our + journey and how these will be reflected in the coming release. +descriptionLong: > + The new release is a result of our learning from our users. There are four + major features coming: + + 🔗 ML pipeline templating and iterative foreach stages + + 🧪 Lightweight ML experiments + + 📍 ML model checkpoints + + 📈 Dvc-live - new open-source library for metrics logging +picture: 2021-02-18/dvc-2-0-pre-release.png +pictureComment: DVC 2.0 Pre-Release +author: dmitry_petrov +commentsUrl: https://discuss.dvc.org/t/dvc-2-0-pre-release/681 +tags: + - Release + - MLOps + - DataOps +--- + +## Install + +First things first. You can install the 2.0 pre-release from the master branch +in our repo (instruction [here](https://dvc.org/doc/install/pre-release)) or +through pip: + +```dvc +$ pip install --upgrade --pre dvc +``` + +## ML pipelines parameterization and foreach stages + +After introducing the multi-stage pipeline file `dvc.yaml`, it was quickly +adopted among our users. The DVC team got tons of positive feedback from them, +as well as feature requests. + +### Pipeline parameters from `vars` + +The most requested feature was the ability to use parameters in `dvc.yaml`. For +example. So, you can pass the same seed value or filename to multiple stages in +the pipeline. + +```yaml +vars: + train_matrix: train.pkl + test_matrix: test.pkl + seed: 20210215 + +... + +stages: + process: + cmd: python process.py \ + --seed ${seed} \ + --train ${train_matrix} \ + --test ${test_matrix} + outs: + - ${test_matrix} + - ${train_matrix} + + ... + + train: + cmd: python train.py ${train_matrix} --seed ${seed} + deps: + - ${train_matrix} +``` + +Also, it gives an ability to localize all the important parameters in a single +`vars` block and play with them. This is a natural thing to do for scenarios +like NLP or when hyperparameter optimization is happening not only in the model +training code but in the data processing as well. 
+ +### Pipeline parameters from params files + +It is quite common to define pipeline parameters in a config file or a +parameters file (like `params.yaml`) instead of in the pipeline file `dvc.yaml` +itself. These parameters defined in `params.yaml` can also be used in +`dvc.yaml`. + +```yaml +# params.yaml +models: + us: + thresh: 10 + filename: 'model-us.hdf5' +``` + +```yaml +# dvc.yaml +stages: + build-us: + cmd: >- + python script.py + --out ${models.us.filename} + --thresh ${models.us.thresh} + outs: + - ${models.us.filename} +``` + +DVC properly tracks params dependencies for each stage starting from the +previous DVC version 1.0. See the +[`--params` option](https://dvc.org/doc/command-reference/run#for-displaying-and-comparing-data-science-experiments) +of `dvc run` for more details. + +### Iterating over params with foreach stages + +Iterating over params was a frequently requested feature. Now users can define +multiple similar stages with a templatized command. + +```yaml +stages: + build: + foreach: + gb: + thresh: 15 + filename: 'model-gb.hdf5' + us: + thresh: 10 + filename: 'model-us.hdf5' + do: + cmd: >- + python script.py --out ${item.filename} --thresh ${item.thresh} + outs: + - ${item.filename} +``` + +## Lightweight ML experiments + +DVC uses Git versioning as the basis for ML experiments. This solid foundation +makes each experiment reproducible and accessible from the project's history. +This Git-based approach works very well for ML projects with mature models when +only a few new experiments per day are run. + +However, in more active development, when dozens or hundreds of experiments need +to be run in a single day, Git creates overhead — each experiment run requires +additional Git commands `git add/commit`, and comparing all experiments is +difficult. + +We introduce lightweight experiments in DVC 2.0! This is how you can auto-track +ML experiments without any overhead from ML engineers. 
+ +⚠️ Note, our new ML experiment features (`dvc exp`) are experimental in the +coming release. This means that the commands might change a bit in the following +minor releases. + +`dvc exp run` can run an ML experiment with a new hyperparameter from +`params.yaml` while `dvc exp diff` shows metrics and params difference: + +```dvc +$ dvc exp run --set-param featurize.max_features=3000 + +Reproduced experiment(s): exp-bb55c +Experiment results have been applied to your workspace. + +$ dvc exp diff +Path Metric Value Change +scores.json auc 0.57462 0.0072197 + +Path Param Value Change +params.yaml featurize.max_features 3000 1500 +``` + +More experiments: + +```dvc +$ dvc exp run --set-param featurize.max_features=4000 +Reproduced experiment(s): exp-9bf22 +Experiment results have been applied to your workspace. + +$ dvc exp run --set-param featurize.max_features=5000 +Reproduced experiment(s): exp-63ee0 +Experiment results have been applied to your workspace. + +$ dvc exp run --set-param featurize.max_features=5000 \ + --set-param featurize.ngrams=3 +Reproduced experiment(s): exp-80655 +Experiment results have been applied to your workspace. +``` + +In the examples above, hyperparameters were changed with the `--set-param` +option, but you can make these changes by modifying the params file instead. In +fact _any code or data files can be changed_ and `dvc exp run` will capture the +variations. 
+ +See all the runs: + +```dvc +$ dvc exp show --no-pager --no-timestamp \ + --include-params featurize.max_features,featurize.ngrams +``` + +```dvctable + ───────────────────────────────────────────────────────────────────── + **Experiment** **auc** **featurize.max_features** **featurize.ngrams** + ───────────────────────────────────────────────────────────────────── + workspace 0.56359 5000 3 + master 0.5674 1500 2 + ├── exp-80655 0.56359 5000 3 + ├── exp-63ee0 0.5515 5000 2 + ├── exp-9bf22 0.56448 4000 2 + └── exp-bb55c 0.57462 3000 2 + ───────────────────────────────────────────────────────────────────── +``` + +Under the hood, DVC uses Git to store the experiments' meta-information. A +straight-forward implementation would create visible branches and auto-commit in +them, but that approach would over-pollute the branch namespace very quickly. To +avoid this issue, we introduced custom Git references `exps`, the same way as +GitHub uses custom references `pulls` to track pull requests (this is an +interesting technical topic that deserves a separate blog post). Below you can +see how it works. + +No artificial branches, only custom references `exps` (do not worry if you don't +understand this part - it is an implementation detail): + +```dvc +$ git branch +* master + +$ git show-ref +5649f62d845fdc29e28ea6f7672dd729d3946940 refs/exps/exec/EXEC_APPLY +5649f62d845fdc29e28ea6f7672dd729d3946940 refs/exps/exec/EXEC_BRANCH +5649f62d845fdc29e28ea6f7672dd729d3946940 refs/exps/71/67904d89e116f28daf7a6e4c0878268117c893/exp-80655 +f16e7b7c804cf52d91d1d11850c15963fb2a8d7b refs/exps/97/d69af70c6fb4bc59aefb9a87437dcd28b3bde4/exp-63ee0 +0566d42cddb3a8c4eb533f31027f0febccbbc2dd refs/exps/91/94265d5acd847e1c439dd859aa74b1fc3d73ad/exp-bb55c +9bb067559583990a8c5d499d7435c35a7c9417b7 refs/exps/49/5c835cd36772123e82e812d96eabcce320f7ec/exp-9bf22 +``` + +The best experiment can be promoted to the workspace and committed to Git. + +```dvc +$ dvc exp apply exp-bb55c +$ git add . 
+$ git commit -m 'optimize max feature size' +``` + +Alternatively, an experiment can be promoted to a branch (`big_fr_size` branch +in this case): + +```dvc +$ dvc exp branch exp-80655 big_fr_size +Git branch 'big_fr_size' has been created from experiment 'exp-c695f'. +To switch to the new branch run: + + git checkout big_fr_size +``` + +Remove all the experiments that were not used: + +```dvc +$ dvc exp gc --workspace --force +``` + +## Model checkpoints + +ML model checkpoints are an essential part of deep learning. ML engineers prefer +to save the model files (or weights) at checkpoints during a training process +and return back when metrics start diverging or learning is not fast enough. + +The checkpoints create a different dynamic around ML modeling process and need a +special support from the toolset: + +1. Track and save model checkpoints (DVC outputs) periodically, not only the + final result or training epoch. +2. Save metrics corresponding to each of the checkpoints. +3. Reuse checkpoints - warm-start training with an existing model file, + corresponding code, dataset version and metrics. + +This new behavior is supported in DVC 2.0. Now, DVC can version all your +checkpoints with corresponding code and data. It brings the reproducibility of +DL processes to the next level - every checkpoint is reproducible. + +This is how you define checkpoints with live-metrics: + +```dvc +$ dvc stage add -n train \ + -d users.csv -d train.py \ + -p dropout,epochs,lr,process \ + --checkpoint model.h5 \ + --live logs \ + python train.py + +Creating 'dvc.yaml' +Adding stage 'train' in 'dvc.yaml' +``` + +Note, we use `dvc stage add` command instead of `dvc run`. Starting from DVC 2.0 +we begin extracting all stage specific functionality under `dvc stage` umbrella. +`dvc run` is still working, but will be deprecated in the following major DVC +version (most likely in 3.0). 
+ +Start the training process and interrupt it after 5 epochs: + +```dvc +$ dvc exp run +'users.csv.dvc' didn't change, skipping +Running stage 'train': +> python train.py +... +^CTraceback (most recent call last): +... +KeyboardInterrupt +``` + +Navigate in checkpoints: + +```dvc +$ dvc exp show --no-pager --no-timestamp +``` + +```dvctable + ────────────────────────────────────────────────────────────────────── + **Experiment** **step** **loss** **accuracy** **val_loss** **…** **epochs** **…** + ────────────────────────────────────────────────────────────────────── + workspace 4 2.0702 0.30388 2.025 … 5 … + master - - - - … 5 … + │ ╓ exp-e15bc 4 2.0702 0.30388 2.025 … 5 … + │ ╟ 5ea8327 4 2.0702 0.30388 2.025 … 5 … + │ ╟ bc0cf02 3 2.1338 0.23988 2.0883 … 5 … + │ ╟ f8cf03f 2 2.1989 0.17932 2.1542 … 5 … + │ ╟ 7575a44 1 2.2694 0.12833 2.223 … 5 … + ├─╨ a72c526 0 2.3416 0.0959 2.2955 … 5 … + ────────────────────────────────────────────────────────────────────── +``` + +Each of the checkpoints above is a separate experiment with all data, code, +paramaters and metrics. You can use the same `dvc exp apply` command to extract +any of these. + +Another run continues this process. You can see how accuracy metrics are +increasing - DVC does not remove the model/checkpoint and training code trains +on top of it: + +```dvc +$ dvc exp run +Existing checkpoint experiment 'exp-e15bc' will be resumed +... 
+^C +KeyboardInterrupt + +$ dvc exp show --no-pager --no-timestamp +``` + +```dvctable + ────────────────────────────────────────────────────────────────────── + **Experiment** **step** **loss** **accuracy** **val_loss** **…** **epochs** **…** + ────────────────────────────────────────────────────────────────────── + workspace 9 1.7845 0.58125 1.7381 … 5 … + master - - - - … 5 … + │ ╓ exp-e15bc 9 1.7845 0.58125 1.7381 … 5 … + │ ╟ 205a8d3 9 1.7845 0.58125 1.7381 … 5 … + │ ╟ dd23d96 8 1.8369 0.54173 1.7919 … 5 … + │ ╟ 5bb3a1f 7 1.8929 0.49108 1.8474 … 5 … + │ ╟ 6dc5610 6 1.951 0.43433 1.9046 … 5 … + │ ╟ a79cf29 5 2.0088 0.36837 1.9637 … 5 … + │ ╟ 5ea8327 4 2.0702 0.30388 2.025 … 5 … + │ ╟ bc0cf02 3 2.1338 0.23988 2.0883 … 5 … + │ ╟ f8cf03f 2 2.1989 0.17932 2.1542 … 5 … + │ ╟ 7575a44 1 2.2694 0.12833 2.223 … 5 … + ├─╨ a72c526 0 2.3416 0.0959 2.2955 … 5 … + ────────────────────────────────────────────────────────────────────── +``` + +After modifying the code, data, or params, the same process can be resumed. DVC +recognizes the change and shows it (see experiment `b363267`): + +```dvc +$ vi train.py # modify code +$ vi params.yaml # modify params + +$ dvc exp run +Modified checkpoint experiment based on 'exp-e15bc' will be created +... 
+ +$ dvc exp show --no-pager --no-timestamp +``` + +```dvctable + ────────────────────────────────────────────────────────────────────────────── + **Experiment** **step** **loss** **accuracy** **val_loss** **…** **epochs** **…** + ────────────────────────────────────────────────────────────────────────────── + workspace 13 1.5841 0.69262 1.5381 … 15 … + master - - - - … 5 … + │ ╓ exp-7ff06 13 1.5841 0.69262 1.5381 … 15 … + │ ╟ 6c62fec 12 1.6325 0.67248 1.5857 … 15 … + │ ╟ 4baca3c 11 1.6817 0.64855 1.6349 … 15 … + │ ╟ b363267 (2b06de7) 10 1.7323 0.61925 1.6857 … 15 … + │ ╓ 2b06de7 9 1.7845 0.58125 1.7381 … 5 … + │ ╟ 205a8d3 9 1.7845 0.58125 1.7381 … 5 … + │ ╟ dd23d96 8 1.8369 0.54173 1.7919 … 5 … + │ ╟ 5bb3a1f 7 1.8929 0.49108 1.8474 … 5 … + │ ╟ 6dc5610 6 1.951 0.43433 1.9046 … 5 … + │ ╟ a79cf29 5 2.0088 0.36837 1.9637 … 5 … + │ ╟ 5ea8327 4 2.0702 0.30388 2.025 … 5 … + │ ╟ bc0cf02 3 2.1338 0.23988 2.0883 … 5 … + │ ╟ f8cf03f 2 2.1989 0.17932 2.1542 … 5 … + │ ╟ 7575a44 1 2.2694 0.12833 2.223 … 5 … + ├─╨ a72c526 0 2.3416 0.0959 2.2955 … 5 … + ────────────────────────────────────────────────────────────────────────────── +``` + +Sometimes you might need to train the model from scratch. The reset option +removes the checkpoint file before training: `dvc exp run --reset`. + +## Metrics logging + +Continuously logging ML metrics is a very common practice in the ML world. +Instead of a simple command-line output with the metrics values, many ML +engineers prefer visuals and plots. These plots can be organized in a "database" +of ML experiments to keep track of a project. There are many special solutions +for metrics collecting and experiment tracking such as sacred, mlflow, weight +and biases, neptune.ai, or others. 
+ +With DVC 2.0, we are releasing a new open-source library +[DVC-Live](https://github.com/iterative/dvclive) that provides functionality for +tracking model metrics and organizing metrics in simple text files in a way that +DVC can visualize the metrics with navigation in Git history. So, DVC can show +you a metrics difference between the current model and a model in `master` or +any other branch. + +This approach is similar to the other metrics tracking tools with the difference +that Git becomes a "database" or of ML experiments. + +### Generate metrics file + +Install the library: + +```dvc +$ pip install dvclive +``` + +Instrument your code: + +```python +import dvclive +from dvclive.keras import DvcLiveCallback + +dvclive.init("logs") #, summarize=True) + +... + +model.fit(... + # Set up DVC-Live callback: + callbacks=[ DvcLiveCallback() ] + ) + +``` + +During the training you will see the metrics files that are continuously +populated each epochs: + +```dvc +$ ls logs/ +accuracy.tsv loss.tsv val_accuracy.tsv val_loss.tsv + +$ head logs/accuracy.tsv +timestamp step accuracy +1613645582716 0 0.7360000014305115 +1613645585478 1 0.8349999785423279 +1613645587322 2 0.8830000162124634 +1613645589125 3 0.9049999713897705 +1613645590891 4 0.9070000052452087 +1613645592681 5 0.9279999732971191 +1613645594490 6 0.9430000185966492 +1613645596232 7 0.9369999766349792 +1613645598034 8 0.9430000185966492 +``` + +In addition to the continuous metrics files, you will see the summary metrics +file and HTML file with the same file prefix. 
The summary file contains the +result of the latest epoch: + +```dvc +$ cat logs.json | python -m json.tool +{ + "step": 41, + "loss": 0.015958430245518684, + "accuracy": 0.9950000047683716, + "val_loss": 13.705962181091309, + "val_accuracy": 0.5149999856948853 +} +``` + +The HTML file contains all the visuals for continuous metrics as well as the +summary metrics on a single page: + +![](../uploads/images/2021-02-18/dvclive-html.png) + +Note, the HTML and the summary metrics files are generating automatically for +each. So, you can monitor model performance in realtime. + +### Git-navigation with the metrics file + +DVC repository is NOT required to use the live metrics functionality from the +above. It works independently from DVC. + +DVC repository becomes useful when the metrics and plots are committed in your +Git repository, and you need navigation around the metrics. + +Metrics difference between workspace and the last Git commit: + +```dvc +$ git status -s + M logs.json + M logs/accuracy.tsv + M logs/loss.tsv + M logs/val_accuracy.tsv + M logs/val_loss.tsv + M train.py +?? 
model.h5 + +$ dvc metrics diff --target logs.json +Path Metric Old New Change +logs.json accuracy 0.995 0.99 -0.005 +logs.json loss 0.01596 0.03036 0.0144 +logs.json step 41 36 -5 +logs.json val_accuracy 0.515 0.5175 0.0025 +logs.json val_loss 13.70596 3.29033 -10.41563 +``` + +The difference between a particular commit/branch/tag or between two commits: + +```dvc +$ dvc metrics diff --target logs.json HEAD^ 47b85c +Path Metric Old New Change +logs.json accuracy 0.995 0.998 0.003 +logs.json loss 0.01596 0.01951 0.00355 +logs.json step 41 82 41 +logs.json val_accuracy 0.515 0.51 -0.005 +logs.json val_loss 13.70596 5.83056 -7.8754 +``` + +The same Git-navigation works with the plots: + +```dvc +$ dvc plots diff --target logs +file:///Users/dmitry/src/exp-dc/plots.html +``` + +![](../uploads/images/2021-02-18/dvclive-diff-html.png) + +Another nice thing about the live metrics - they work across ML experiments and +checkpoints, if properly set up in dvc stages. To set up live metrics, you need +to specify the metrics directory in the `live` section of a stage: + +```yaml +stages: + train: + cmd: python train.py + live: + logs: + cache: false + summary: true + report: true + deps: + - data +``` + +## Thank you! + +I'd like to thank all of you DVC community members for the feedback that we are +constantly getting. This feedback helps us build new functionalities in DVC and +make it more stable. + +Please be in touch with us on [Twitter](https://twitter.com/DVCorg) and our +[Discord channel](https://dvc.org/chat). 
diff --git a/content/blogs/2021-02-22-cml-runner-prerelease.md b/content/blogs/2021-02-22-cml-runner-prerelease.md new file mode 100644 index 0000000000..2793f9c63e --- /dev/null +++ b/content/blogs/2021-02-22-cml-runner-prerelease.md @@ -0,0 +1,177 @@ +--- +title: + 'CML Pre-Release Notes: Automatically Train Models in the Cloud with CML 0.3.0' +date: 2021-02-22 +description: > + New features are here to make launching cloud compute for continuous + integration workflows shorter, sweeter and easier than ever. Plus, a new + GitHub Action to setup CML means more ways to use CML without our Docker + container. +descriptionLong: > + New features are here to make launching cloud compute for continuous + integration workflows shorter, sweeter and easier than ever. Plus, a new + GitHub Action to setup CML means more ways to use CML without our Docker + container. +picture: 2021-02-22/cover.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/cml-0-3-0-pre-release/685 +tags: + - CML + - GitHub Actions + - GitLab CI + - Terraform + - Release +--- + +Today, we're pre-releasing some new features in Continuous Machine Learning, or +[CML](https://cml.dev)—our open source project to adapt popular continuous +integration (CI) systems like GitHub Actions and GitLab CI for data science. CML +has become a popular tool for auto-generating ML model reports right in a GitHub +Pull Request and orchestrating resources for training models in the cloud. + +Here's what's in today's pre-release: + +## Brand new method to provision cloud compute for your CI workflows + +After the initial CML release, we found ways to significantly simplify the +process of allocating resources in CI/CD. 
We developed a brand new CML command +`cml runner` that hides much of the complexity of configuring and provisioning +an instance, keeping your workflows free of `bash` scripting clutter (until the +official release, docs are +[in development here](https://github.com/iterative/cml/blob/c2b96c461011f01ab2476e1542fb89d7229d150d/README.md)). +The new approach uses Terraform provider under the hood instead of Docker +Machine, as in the first version. + +Check out this example workflow to launch an EC2 instance from a GitHub Action +workflow and then train a model. We hope you'll agree it's shorter, sweeter, and +more powerful than ever! + +```yaml +name: 'Train in the cloud' +on: [push] + +jobs: + deploy-runner: + runs-on: [ubuntu-latest] + steps: + - uses: iterative/setup-cml@v1 + - uses: actions/checkout@v2 + - name: deploy + shell: bash + env: + repo_token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + run: | + cml runner \ + --cloud aws \ + --cloud-region us-west \ + --cloud-type=t2.micro \ + --labels=cml-runner + train-model: + needs: deploy-runner + runs-on: [self-hosted, cml-runner] + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: 'Train my model' + run: | + pip install -r requirements.txt + python train.py +``` + +If you use CML functions in the `train-model` step, you can go even further and +get a closed loop—sending model training results from the EC2 instance to your +pull request or merge request! 
For example, if we expand the `train-model` step +to incorporate functions like `cml publish` and `cml send-comment`: + +```yaml +train-model: + needs: deploy-runner + runs-on: [self-hosted, cml-runner] + container: docker://dvcorg/cml + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: 'Train a model' + env: + repo_token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} + run: | + pip install -r requirements.txt + python train.py + + echo "## Report from your EC2 Instance" > report.md + cat metrics.txt >> report.md + cml publish "plot.png" --md >> report.md + cml send-comment report.md +``` + +You'll get a pull request that looks something like this: + +![](../uploads/images/2021-02-22/sample_pr.png) + +All the code to replicate this example is up on a +[brand new demo repository](https://github.com/iterative/cml-runner-base-case). + +### Our favorite details + +The new `cml runner` function lets you turn on instances, including GPU, +high-memory and spot instances, and kick off a new workflow using the hardware +and environment of your choice—and of course, it'll turn _off_ those instances +after a configurable timeout! In the first CML release, this took +[more than 30 lines of code](https://github.com/iterative/cml_cloud_case/blob/master/.github/workflows/cml.yaml) +to configure. Now it's just one function. + +Another highlight: you can use whatever Docker container you'd like on your +instance. In the above example, we use our +[custom CML Docker container](https://github.com/iterative/cml/blob/master/Dockerfile) +(because we like it!)—but you certainly don't have to! Whatever image you +choose, we highly recommend containerizing your environment for ultimate +reproducibility and security with CML. + +You can also use the new `cml runner` function to set up a +[local self-hosted runner](https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners). 
+On your local machine or on-premise GPU cluster, you'll install CML as a package +and then run: + +```bash +$ cml runner \ + --repo $your_project_repository_url \ + --token=$personal_access_token \ + --labels tf \ + --idle-timeout 180 +``` + +Now your machine will be listening for workflows from your project repository. + +## A New GitHub Action + +One more thing: you might've noticed in our example workflow above that there's +a [new CML GitHub Action](https://github.com/iterative/setup-cml)! The new +Action helps you setup CML, giving you one more way to mix and match the CML +suite of functions with your preferred environment. + +The new Action is designed to be a straightforward, all-in-one install that +gives you immediate use of functions like `cml publish` and `cml runner`. You'll +add this step to your workflow: + +```yaml +steps: + - uses: actions/checkout@v2 + - uses: iterative/setup-cml@v1 +``` + +[More details are in the docs!](https://github.com/iterative/setup-cml) + +## Get ready for the release + +We're inviting our community members to explore these new features in +anticipation of our upcoming, _official_ release. As always, feedback is welcome +by opening an issue on the +[CML GitHub repository](https://github.com/iterative/cml), as a comment here or +via our [Discord channel](https://discord.gg/bzA6uY7). We're excited to hear +what you think! diff --git a/content/blogs/2021-02-26-february-21-community-gems.md b/content/blogs/2021-02-26-february-21-community-gems.md new file mode 100644 index 0000000000..022be9b61a --- /dev/null +++ b/content/blogs/2021-02-26-february-21-community-gems.md @@ -0,0 +1,175 @@ +--- +title: February '21 Community Gems +date: 2021-02-26 +description: > + A roundup of technical Q&A's from the DVC community. This month: best + practices for config files, pipeline dependency management,and caching data + for CI/CD. Plus a new CML feature to launch cloud compute with Terraform! 
+descriptionLong: > + A roundup of technical Q&A's from the DVC community. This month: best + practices for config files, pipeline dependency management,and caching data + for CI/CD. Plus a new CML feature to launch cloud compute with Terraform! +picture: 2021-02-26/feb-gems-cover.png +author: elle_obrien +commentsUrl: https://discuss.dvc.org/t/february-21-community-gems/686 +tags: + - Community Gems + - CML + - Pipelines + - Terraform + - Conda +--- + +## DVC Questions + +### [Q: I noticed I have a DVC `config` file and a `config.local` file. What's best practice for committing these to my Git repository?](https://discord.com/channels/485586884165107732/563406153334128681/666708671333400599) + +DVC uses the `config` and `config.local` files to link your remote data +repository to your project. `config` is intended to be committed to Git, while +`config.local` is not - it's a file that you use to store sensitive information +(e.g. your personal credentials - username, password, access keys, etc. for +remote storage) or settings that are specific to your local environment. + +Usually, you don't have to worry about ensuring your `config.local` file is +being ignored by Git- the only way to create a `config.local` file is using the +`--local` flag explicitly in functions like `dvc remote` and `dvc config` +commands, so you'll know you've made one! And your `config.local` file is +`.gitignored` by default. If you're concerned, take a look and make sure there +are no settings in your `config.local` file that you actually want in your +regular `config` file. + +To learn more about `config` and `config.local`, +[read up in our docs](https://dvc.org/doc/command-reference/remote#example-add-a-default-local-remote). + +### [Q: What's the best way to install the new version of DVC in a Conda environment? 
I'm concerned about the `paramiko` dependency.](https://discord.com/channels/485586884165107732/563406153334128681/669173874247729165) + +When you install DVC via `conda`, it will come with dependencies like +`paramiko`. + +The only exception when installing DVC as a Python library is with `pip`: you +might want to specify the kind of remote storage you need to make sure all +dependencies are present (like `boto` for S3). You can run +`pip install "dvc[